From d31ca4fc8ede3b4a28abea7df57e01591ee8bb7d Mon Sep 17 00:00:00 2001 From: Lzu Tao Date: Fri, 4 Sep 2020 07:34:23 +0000 Subject: [PATCH 1/7] Move Utf8Error to new mod --- library/core/src/str/error.rs | 129 +++++++++++++++++++++++++++++++++ library/core/src/str/mod.rs | 131 ++-------------------------------- 2 files changed, 134 insertions(+), 126 deletions(-) create mode 100644 library/core/src/str/error.rs diff --git a/library/core/src/str/error.rs b/library/core/src/str/error.rs new file mode 100644 index 00000000000..43b790a4aca --- /dev/null +++ b/library/core/src/str/error.rs @@ -0,0 +1,129 @@ +//! Defines utf8 error type. + +use crate::fmt; + +/// Errors which can occur when attempting to interpret a sequence of [`u8`] +/// as a string. +/// +/// As such, the `from_utf8` family of functions and methods for both [`String`]s +/// and [`&str`]s make use of this error, for example. +/// +/// [`String`]: ../../std/string/struct.String.html#method.from_utf8 +/// [`&str`]: from_utf8 +/// +/// # Examples +/// +/// This error type’s methods can be used to create functionality +/// similar to `String::from_utf8_lossy` without allocating heap memory: +/// +/// ``` +/// fn from_utf8_lossy(mut input: &[u8], mut push: F) where F: FnMut(&str) { +/// loop { +/// match std::str::from_utf8(input) { +/// Ok(valid) => { +/// push(valid); +/// break +/// } +/// Err(error) => { +/// let (valid, after_valid) = input.split_at(error.valid_up_to()); +/// unsafe { +/// push(std::str::from_utf8_unchecked(valid)) +/// } +/// push("\u{FFFD}"); +/// +/// if let Some(invalid_sequence_length) = error.error_len() { +/// input = &after_valid[invalid_sequence_length..] 
+/// } else { +/// break +/// } +/// } +/// } +/// } +/// } +/// ``` +#[derive(Copy, Eq, PartialEq, Clone, Debug)] +#[stable(feature = "rust1", since = "1.0.0")] +pub struct Utf8Error { + pub(super) valid_up_to: usize, + pub(super) error_len: Option, +} + +impl Utf8Error { + /// Returns the index in the given string up to which valid UTF-8 was + /// verified. + /// + /// It is the maximum index such that `from_utf8(&input[..index])` + /// would return `Ok(_)`. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use std::str; + /// + /// // some invalid bytes, in a vector + /// let sparkle_heart = vec![0, 159, 146, 150]; + /// + /// // std::str::from_utf8 returns a Utf8Error + /// let error = str::from_utf8(&sparkle_heart).unwrap_err(); + /// + /// // the second byte is invalid here + /// assert_eq!(1, error.valid_up_to()); + /// ``` + #[stable(feature = "utf8_error", since = "1.5.0")] + pub fn valid_up_to(&self) -> usize { + self.valid_up_to + } + + /// Provides more information about the failure: + /// + /// * `None`: the end of the input was reached unexpectedly. + /// `self.valid_up_to()` is 1 to 3 bytes from the end of the input. + /// If a byte stream (such as a file or a network socket) is being decoded incrementally, + /// this could be a valid `char` whose UTF-8 byte sequence is spanning multiple chunks. + /// + /// * `Some(len)`: an unexpected byte was encountered. + /// The length provided is that of the invalid byte sequence + /// that starts at the index given by `valid_up_to()`. + /// Decoding should resume after that sequence + /// (after inserting a [`U+FFFD REPLACEMENT CHARACTER`][U+FFFD]) in case of + /// lossy decoding. 
+ /// + /// [U+FFFD]: ../../std/char/constant.REPLACEMENT_CHARACTER.html + #[stable(feature = "utf8_error_error_len", since = "1.20.0")] + pub fn error_len(&self) -> Option { + self.error_len.map(|len| len as usize) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl fmt::Display for Utf8Error { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if let Some(error_len) = self.error_len { + write!( + f, + "invalid utf-8 sequence of {} bytes from index {}", + error_len, self.valid_up_to + ) + } else { + write!(f, "incomplete utf-8 byte sequence from index {}", self.valid_up_to) + } + } +} + +/// An error returned when parsing a `bool` using [`from_str`] fails +/// +/// [`from_str`]: FromStr::from_str +#[derive(Debug, Clone, PartialEq, Eq)] +#[stable(feature = "rust1", since = "1.0.0")] +pub struct ParseBoolError { + pub(super) _priv: (), +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl fmt::Display for ParseBoolError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + "provided string was not `true` or `false`".fmt(f) + } +} diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs index e4a6b7e142a..b00d94f4858 100644 --- a/library/core/src/str/mod.rs +++ b/library/core/src/str/mod.rs @@ -8,6 +8,8 @@ #![stable(feature = "rust1", since = "1.0.0")] +mod error; + use self::pattern::Pattern; use self::pattern::{DoubleEndedSearcher, ReverseSearcher, Searcher}; @@ -27,6 +29,9 @@ pub mod pattern; #[allow(missing_docs)] pub mod lossy; +#[stable(feature = "rust1", since = "1.0.0")] +pub use error::{ParseBoolError, Utf8Error}; + /// Parse a value from a string /// /// `FromStr`'s [`from_str`] method is often used implicitly, through @@ -138,121 +143,10 @@ impl FromStr for bool { } } -/// An error returned when parsing a `bool` using [`from_str`] fails -/// -/// [`from_str`]: FromStr::from_str -#[derive(Debug, Clone, PartialEq, Eq)] -#[stable(feature = "rust1", since = "1.0.0")] -pub struct ParseBoolError { - _priv: (), 
-} - -#[stable(feature = "rust1", since = "1.0.0")] -impl fmt::Display for ParseBoolError { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - "provided string was not `true` or `false`".fmt(f) - } -} - /* Section: Creating a string */ -/// Errors which can occur when attempting to interpret a sequence of [`u8`] -/// as a string. -/// -/// As such, the `from_utf8` family of functions and methods for both [`String`]s -/// and [`&str`]s make use of this error, for example. -/// -/// [`String`]: ../../std/string/struct.String.html#method.from_utf8 -/// [`&str`]: from_utf8 -/// -/// # Examples -/// -/// This error type’s methods can be used to create functionality -/// similar to `String::from_utf8_lossy` without allocating heap memory: -/// -/// ``` -/// fn from_utf8_lossy(mut input: &[u8], mut push: F) where F: FnMut(&str) { -/// loop { -/// match std::str::from_utf8(input) { -/// Ok(valid) => { -/// push(valid); -/// break -/// } -/// Err(error) => { -/// let (valid, after_valid) = input.split_at(error.valid_up_to()); -/// unsafe { -/// push(std::str::from_utf8_unchecked(valid)) -/// } -/// push("\u{FFFD}"); -/// -/// if let Some(invalid_sequence_length) = error.error_len() { -/// input = &after_valid[invalid_sequence_length..] -/// } else { -/// break -/// } -/// } -/// } -/// } -/// } -/// ``` -#[derive(Copy, Eq, PartialEq, Clone, Debug)] -#[stable(feature = "rust1", since = "1.0.0")] -pub struct Utf8Error { - valid_up_to: usize, - error_len: Option, -} - -impl Utf8Error { - /// Returns the index in the given string up to which valid UTF-8 was - /// verified. - /// - /// It is the maximum index such that `from_utf8(&input[..index])` - /// would return `Ok(_)`. 
- /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use std::str; - /// - /// // some invalid bytes, in a vector - /// let sparkle_heart = vec![0, 159, 146, 150]; - /// - /// // std::str::from_utf8 returns a Utf8Error - /// let error = str::from_utf8(&sparkle_heart).unwrap_err(); - /// - /// // the second byte is invalid here - /// assert_eq!(1, error.valid_up_to()); - /// ``` - #[stable(feature = "utf8_error", since = "1.5.0")] - pub fn valid_up_to(&self) -> usize { - self.valid_up_to - } - - /// Provides more information about the failure: - /// - /// * `None`: the end of the input was reached unexpectedly. - /// `self.valid_up_to()` is 1 to 3 bytes from the end of the input. - /// If a byte stream (such as a file or a network socket) is being decoded incrementally, - /// this could be a valid `char` whose UTF-8 byte sequence is spanning multiple chunks. - /// - /// * `Some(len)`: an unexpected byte was encountered. - /// The length provided is that of the invalid byte sequence - /// that starts at the index given by `valid_up_to()`. - /// Decoding should resume after that sequence - /// (after inserting a [`U+FFFD REPLACEMENT CHARACTER`][U+FFFD]) in case of - /// lossy decoding. - /// - /// [U+FFFD]: ../../std/char/constant.REPLACEMENT_CHARACTER.html - #[stable(feature = "utf8_error_error_len", since = "1.20.0")] - pub fn error_len(&self) -> Option { - self.error_len.map(|len| len as usize) - } -} - /// Converts a slice of bytes to a string slice. 
/// /// A string slice ([`&str`]) is made of bytes ([`u8`]), and a byte slice @@ -440,21 +334,6 @@ pub unsafe fn from_utf8_unchecked_mut(v: &mut [u8]) -> &mut str { unsafe { &mut *(v as *mut [u8] as *mut str) } } -#[stable(feature = "rust1", since = "1.0.0")] -impl fmt::Display for Utf8Error { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - if let Some(error_len) = self.error_len { - write!( - f, - "invalid utf-8 sequence of {} bytes from index {}", - error_len, self.valid_up_to - ) - } else { - write!(f, "incomplete utf-8 byte sequence from index {}", self.valid_up_to) - } - } -} - /* Section: Iterators */ From 5b533fccf39501bd5ca8a436da6f2d8b5999664d Mon Sep 17 00:00:00 2001 From: Lzu Tao Date: Fri, 4 Sep 2020 07:10:05 +0000 Subject: [PATCH 2/7] Move traits implementation of str to new mod Also move FromStr trait --- library/core/src/str/mod.rs | 604 +-------------------------------- library/core/src/str/traits.rs | 597 ++++++++++++++++++++++++++++++++ 2 files changed, 599 insertions(+), 602 deletions(-) create mode 100644 library/core/src/str/traits.rs diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs index b00d94f4858..147d341c8b0 100644 --- a/library/core/src/str/mod.rs +++ b/library/core/src/str/mod.rs @@ -9,6 +9,7 @@ #![stable(feature = "rust1", since = "1.0.0")] mod error; +mod traits; use self::pattern::Pattern; use self::pattern::{DoubleEndedSearcher, ReverseSearcher, Searcher}; @@ -32,116 +33,8 @@ pub mod lossy; #[stable(feature = "rust1", since = "1.0.0")] pub use error::{ParseBoolError, Utf8Error}; -/// Parse a value from a string -/// -/// `FromStr`'s [`from_str`] method is often used implicitly, through -/// [`str`]'s [`parse`] method. See [`parse`]'s documentation for examples. -/// -/// [`from_str`]: FromStr::from_str -/// [`parse`]: str::parse -/// -/// `FromStr` does not have a lifetime parameter, and so you can only parse types -/// that do not contain a lifetime parameter themselves. 
In other words, you can -/// parse an `i32` with `FromStr`, but not a `&i32`. You can parse a struct that -/// contains an `i32`, but not one that contains an `&i32`. -/// -/// # Examples -/// -/// Basic implementation of `FromStr` on an example `Point` type: -/// -/// ``` -/// use std::str::FromStr; -/// use std::num::ParseIntError; -/// -/// #[derive(Debug, PartialEq)] -/// struct Point { -/// x: i32, -/// y: i32 -/// } -/// -/// impl FromStr for Point { -/// type Err = ParseIntError; -/// -/// fn from_str(s: &str) -> Result { -/// let coords: Vec<&str> = s.trim_matches(|p| p == '(' || p == ')' ) -/// .split(',') -/// .collect(); -/// -/// let x_fromstr = coords[0].parse::()?; -/// let y_fromstr = coords[1].parse::()?; -/// -/// Ok(Point { x: x_fromstr, y: y_fromstr }) -/// } -/// } -/// -/// let p = Point::from_str("(1,2)"); -/// assert_eq!(p.unwrap(), Point{ x: 1, y: 2} ) -/// ``` #[stable(feature = "rust1", since = "1.0.0")] -pub trait FromStr: Sized { - /// The associated error which can be returned from parsing. - #[stable(feature = "rust1", since = "1.0.0")] - type Err; - - /// Parses a string `s` to return a value of this type. - /// - /// If parsing succeeds, return the value inside [`Ok`], otherwise - /// when the string is ill-formatted return an error specific to the - /// inside [`Err`]. The error type is specific to implementation of the trait. - /// - /// # Examples - /// - /// Basic usage with [`i32`][ithirtytwo], a type that implements `FromStr`: - /// - /// [ithirtytwo]: ../../std/primitive.i32.html - /// - /// ``` - /// use std::str::FromStr; - /// - /// let s = "5"; - /// let x = i32::from_str(s).unwrap(); - /// - /// assert_eq!(5, x); - /// ``` - #[stable(feature = "rust1", since = "1.0.0")] - fn from_str(s: &str) -> Result; -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl FromStr for bool { - type Err = ParseBoolError; - - /// Parse a `bool` from a string. 
- /// - /// Yields a `Result`, because `s` may or may not - /// actually be parseable. - /// - /// # Examples - /// - /// ``` - /// use std::str::FromStr; - /// - /// assert_eq!(FromStr::from_str("true"), Ok(true)); - /// assert_eq!(FromStr::from_str("false"), Ok(false)); - /// assert!(::from_str("not even a boolean").is_err()); - /// ``` - /// - /// Note, in many cases, the `.parse()` method on `str` is more proper. - /// - /// ``` - /// assert_eq!("true".parse(), Ok(true)); - /// assert_eq!("false".parse(), Ok(false)); - /// assert!("not even a boolean".parse::().is_err()); - /// ``` - #[inline] - fn from_str(s: &str) -> Result { - match s { - "true" => Ok(true), - "false" => Ok(false), - _ => Err(ParseBoolError { _priv: () }), - } - } -} +pub use traits::FromStr; /* Section: Creating a string @@ -1586,499 +1479,6 @@ const CONT_MASK: u8 = 0b0011_1111; /// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte. const TAG_CONT_U8: u8 = 0b1000_0000; -/* -Section: Trait implementations -*/ - -mod traits { - use crate::cmp::Ordering; - use crate::ops; - use crate::ptr; - use crate::slice::SliceIndex; - - /// Implements ordering of strings. - /// - /// Strings are ordered lexicographically by their byte values. This orders Unicode code - /// points based on their positions in the code charts. This is not necessarily the same as - /// "alphabetical" order, which varies by language and locale. Sorting strings according to - /// culturally-accepted standards requires locale-specific data that is outside the scope of - /// the `str` type. 
- #[stable(feature = "rust1", since = "1.0.0")] - impl Ord for str { - #[inline] - fn cmp(&self, other: &str) -> Ordering { - self.as_bytes().cmp(other.as_bytes()) - } - } - - #[stable(feature = "rust1", since = "1.0.0")] - impl PartialEq for str { - #[inline] - fn eq(&self, other: &str) -> bool { - self.as_bytes() == other.as_bytes() - } - #[inline] - fn ne(&self, other: &str) -> bool { - !(*self).eq(other) - } - } - - #[stable(feature = "rust1", since = "1.0.0")] - impl Eq for str {} - - /// Implements comparison operations on strings. - /// - /// Strings are compared lexicographically by their byte values. This compares Unicode code - /// points based on their positions in the code charts. This is not necessarily the same as - /// "alphabetical" order, which varies by language and locale. Comparing strings according to - /// culturally-accepted standards requires locale-specific data that is outside the scope of - /// the `str` type. - #[stable(feature = "rust1", since = "1.0.0")] - impl PartialOrd for str { - #[inline] - fn partial_cmp(&self, other: &str) -> Option { - Some(self.cmp(other)) - } - } - - #[stable(feature = "rust1", since = "1.0.0")] - impl ops::Index for str - where - I: SliceIndex, - { - type Output = I::Output; - - #[inline] - fn index(&self, index: I) -> &I::Output { - index.index(self) - } - } - - #[stable(feature = "rust1", since = "1.0.0")] - impl ops::IndexMut for str - where - I: SliceIndex, - { - #[inline] - fn index_mut(&mut self, index: I) -> &mut I::Output { - index.index_mut(self) - } - } - - #[inline(never)] - #[cold] - #[track_caller] - fn str_index_overflow_fail() -> ! { - panic!("attempted to index str up to maximum usize"); - } - - /// Implements substring slicing with syntax `&self[..]` or `&mut self[..]`. - /// - /// Returns a slice of the whole string, i.e., returns `&self` or `&mut - /// self`. Equivalent to `&self[0 .. len]` or `&mut self[0 .. len]`. Unlike - /// other indexing operations, this can never panic. 
- /// - /// This operation is `O(1)`. - /// - /// Prior to 1.20.0, these indexing operations were still supported by - /// direct implementation of `Index` and `IndexMut`. - /// - /// Equivalent to `&self[0 .. len]` or `&mut self[0 .. len]`. - #[stable(feature = "str_checked_slicing", since = "1.20.0")] - unsafe impl SliceIndex for ops::RangeFull { - type Output = str; - #[inline] - fn get(self, slice: &str) -> Option<&Self::Output> { - Some(slice) - } - #[inline] - fn get_mut(self, slice: &mut str) -> Option<&mut Self::Output> { - Some(slice) - } - #[inline] - unsafe fn get_unchecked(self, slice: *const str) -> *const Self::Output { - slice - } - #[inline] - unsafe fn get_unchecked_mut(self, slice: *mut str) -> *mut Self::Output { - slice - } - #[inline] - fn index(self, slice: &str) -> &Self::Output { - slice - } - #[inline] - fn index_mut(self, slice: &mut str) -> &mut Self::Output { - slice - } - } - - /// Implements substring slicing with syntax `&self[begin .. end]` or `&mut - /// self[begin .. end]`. - /// - /// Returns a slice of the given string from the byte range - /// [`begin`, `end`). - /// - /// This operation is `O(1)`. - /// - /// Prior to 1.20.0, these indexing operations were still supported by - /// direct implementation of `Index` and `IndexMut`. - /// - /// # Panics - /// - /// Panics if `begin` or `end` does not point to the starting byte offset of - /// a character (as defined by `is_char_boundary`), if `begin > end`, or if - /// `end > len`. - /// - /// # Examples - /// - /// ``` - /// let s = "Löwe 老虎 Léopard"; - /// assert_eq!(&s[0 .. 1], "L"); - /// - /// assert_eq!(&s[1 .. 9], "öwe 老"); - /// - /// // these will panic: - /// // byte 2 lies within `ö`: - /// // &s[2 ..3]; - /// - /// // byte 8 lies within `老` - /// // &s[1 .. 8]; - /// - /// // byte 100 is outside the string - /// // &s[3 .. 
100]; - /// ``` - #[stable(feature = "str_checked_slicing", since = "1.20.0")] - unsafe impl SliceIndex for ops::Range { - type Output = str; - #[inline] - fn get(self, slice: &str) -> Option<&Self::Output> { - if self.start <= self.end - && slice.is_char_boundary(self.start) - && slice.is_char_boundary(self.end) - { - // SAFETY: just checked that `start` and `end` are on a char boundary, - // and we are passing in a safe reference, so the return value will also be one. - // We also checked char boundaries, so this is valid UTF-8. - Some(unsafe { &*self.get_unchecked(slice) }) - } else { - None - } - } - #[inline] - fn get_mut(self, slice: &mut str) -> Option<&mut Self::Output> { - if self.start <= self.end - && slice.is_char_boundary(self.start) - && slice.is_char_boundary(self.end) - { - // SAFETY: just checked that `start` and `end` are on a char boundary. - // We know the pointer is unique because we got it from `slice`. - Some(unsafe { &mut *self.get_unchecked_mut(slice) }) - } else { - None - } - } - #[inline] - unsafe fn get_unchecked(self, slice: *const str) -> *const Self::Output { - let slice = slice as *const [u8]; - // SAFETY: the caller guarantees that `self` is in bounds of `slice` - // which satisfies all the conditions for `add`. - let ptr = unsafe { slice.as_ptr().add(self.start) }; - let len = self.end - self.start; - ptr::slice_from_raw_parts(ptr, len) as *const str - } - #[inline] - unsafe fn get_unchecked_mut(self, slice: *mut str) -> *mut Self::Output { - let slice = slice as *mut [u8]; - // SAFETY: see comments for `get_unchecked`. 
- let ptr = unsafe { slice.as_mut_ptr().add(self.start) }; - let len = self.end - self.start; - ptr::slice_from_raw_parts_mut(ptr, len) as *mut str - } - #[inline] - fn index(self, slice: &str) -> &Self::Output { - let (start, end) = (self.start, self.end); - match self.get(slice) { - Some(s) => s, - None => super::slice_error_fail(slice, start, end), - } - } - #[inline] - fn index_mut(self, slice: &mut str) -> &mut Self::Output { - // is_char_boundary checks that the index is in [0, .len()] - // cannot reuse `get` as above, because of NLL trouble - if self.start <= self.end - && slice.is_char_boundary(self.start) - && slice.is_char_boundary(self.end) - { - // SAFETY: just checked that `start` and `end` are on a char boundary, - // and we are passing in a safe reference, so the return value will also be one. - unsafe { &mut *self.get_unchecked_mut(slice) } - } else { - super::slice_error_fail(slice, self.start, self.end) - } - } - } - - /// Implements substring slicing with syntax `&self[.. end]` or `&mut - /// self[.. end]`. - /// - /// Returns a slice of the given string from the byte range [`0`, `end`). - /// Equivalent to `&self[0 .. end]` or `&mut self[0 .. end]`. - /// - /// This operation is `O(1)`. - /// - /// Prior to 1.20.0, these indexing operations were still supported by - /// direct implementation of `Index` and `IndexMut`. - /// - /// # Panics - /// - /// Panics if `end` does not point to the starting byte offset of a - /// character (as defined by `is_char_boundary`), or if `end > len`. - #[stable(feature = "str_checked_slicing", since = "1.20.0")] - unsafe impl SliceIndex for ops::RangeTo { - type Output = str; - #[inline] - fn get(self, slice: &str) -> Option<&Self::Output> { - if slice.is_char_boundary(self.end) { - // SAFETY: just checked that `end` is on a char boundary, - // and we are passing in a safe reference, so the return value will also be one. 
- Some(unsafe { &*self.get_unchecked(slice) }) - } else { - None - } - } - #[inline] - fn get_mut(self, slice: &mut str) -> Option<&mut Self::Output> { - if slice.is_char_boundary(self.end) { - // SAFETY: just checked that `end` is on a char boundary, - // and we are passing in a safe reference, so the return value will also be one. - Some(unsafe { &mut *self.get_unchecked_mut(slice) }) - } else { - None - } - } - #[inline] - unsafe fn get_unchecked(self, slice: *const str) -> *const Self::Output { - let slice = slice as *const [u8]; - let ptr = slice.as_ptr(); - ptr::slice_from_raw_parts(ptr, self.end) as *const str - } - #[inline] - unsafe fn get_unchecked_mut(self, slice: *mut str) -> *mut Self::Output { - let slice = slice as *mut [u8]; - let ptr = slice.as_mut_ptr(); - ptr::slice_from_raw_parts_mut(ptr, self.end) as *mut str - } - #[inline] - fn index(self, slice: &str) -> &Self::Output { - let end = self.end; - match self.get(slice) { - Some(s) => s, - None => super::slice_error_fail(slice, 0, end), - } - } - #[inline] - fn index_mut(self, slice: &mut str) -> &mut Self::Output { - if slice.is_char_boundary(self.end) { - // SAFETY: just checked that `end` is on a char boundary, - // and we are passing in a safe reference, so the return value will also be one. - unsafe { &mut *self.get_unchecked_mut(slice) } - } else { - super::slice_error_fail(slice, 0, self.end) - } - } - } - - /// Implements substring slicing with syntax `&self[begin ..]` or `&mut - /// self[begin ..]`. - /// - /// Returns a slice of the given string from the byte range [`begin`, - /// `len`). Equivalent to `&self[begin .. len]` or `&mut self[begin .. - /// len]`. - /// - /// This operation is `O(1)`. - /// - /// Prior to 1.20.0, these indexing operations were still supported by - /// direct implementation of `Index` and `IndexMut`. 
- /// - /// # Panics - /// - /// Panics if `begin` does not point to the starting byte offset of - /// a character (as defined by `is_char_boundary`), or if `begin > len`. - #[stable(feature = "str_checked_slicing", since = "1.20.0")] - unsafe impl SliceIndex for ops::RangeFrom { - type Output = str; - #[inline] - fn get(self, slice: &str) -> Option<&Self::Output> { - if slice.is_char_boundary(self.start) { - // SAFETY: just checked that `start` is on a char boundary, - // and we are passing in a safe reference, so the return value will also be one. - Some(unsafe { &*self.get_unchecked(slice) }) - } else { - None - } - } - #[inline] - fn get_mut(self, slice: &mut str) -> Option<&mut Self::Output> { - if slice.is_char_boundary(self.start) { - // SAFETY: just checked that `start` is on a char boundary, - // and we are passing in a safe reference, so the return value will also be one. - Some(unsafe { &mut *self.get_unchecked_mut(slice) }) - } else { - None - } - } - #[inline] - unsafe fn get_unchecked(self, slice: *const str) -> *const Self::Output { - let slice = slice as *const [u8]; - // SAFETY: the caller guarantees that `self` is in bounds of `slice` - // which satisfies all the conditions for `add`. - let ptr = unsafe { slice.as_ptr().add(self.start) }; - let len = slice.len() - self.start; - ptr::slice_from_raw_parts(ptr, len) as *const str - } - #[inline] - unsafe fn get_unchecked_mut(self, slice: *mut str) -> *mut Self::Output { - let slice = slice as *mut [u8]; - // SAFETY: identical to `get_unchecked`. 
- let ptr = unsafe { slice.as_mut_ptr().add(self.start) }; - let len = slice.len() - self.start; - ptr::slice_from_raw_parts_mut(ptr, len) as *mut str - } - #[inline] - fn index(self, slice: &str) -> &Self::Output { - let (start, end) = (self.start, slice.len()); - match self.get(slice) { - Some(s) => s, - None => super::slice_error_fail(slice, start, end), - } - } - #[inline] - fn index_mut(self, slice: &mut str) -> &mut Self::Output { - if slice.is_char_boundary(self.start) { - // SAFETY: just checked that `start` is on a char boundary, - // and we are passing in a safe reference, so the return value will also be one. - unsafe { &mut *self.get_unchecked_mut(slice) } - } else { - super::slice_error_fail(slice, self.start, slice.len()) - } - } - } - - /// Implements substring slicing with syntax `&self[begin ..= end]` or `&mut - /// self[begin ..= end]`. - /// - /// Returns a slice of the given string from the byte range - /// [`begin`, `end`]. Equivalent to `&self [begin .. end + 1]` or `&mut - /// self[begin .. end + 1]`, except if `end` has the maximum value for - /// `usize`. - /// - /// This operation is `O(1)`. - /// - /// # Panics - /// - /// Panics if `begin` does not point to the starting byte offset of - /// a character (as defined by `is_char_boundary`), if `end` does not point - /// to the ending byte offset of a character (`end + 1` is either a starting - /// byte offset or equal to `len`), if `begin > end`, or if `end >= len`. 
- #[stable(feature = "inclusive_range", since = "1.26.0")] - unsafe impl SliceIndex for ops::RangeInclusive { - type Output = str; - #[inline] - fn get(self, slice: &str) -> Option<&Self::Output> { - if *self.end() == usize::MAX { - None - } else { - (*self.start()..self.end() + 1).get(slice) - } - } - #[inline] - fn get_mut(self, slice: &mut str) -> Option<&mut Self::Output> { - if *self.end() == usize::MAX { - None - } else { - (*self.start()..self.end() + 1).get_mut(slice) - } - } - #[inline] - unsafe fn get_unchecked(self, slice: *const str) -> *const Self::Output { - // SAFETY: the caller must uphold the safety contract for `get_unchecked`. - unsafe { (*self.start()..self.end() + 1).get_unchecked(slice) } - } - #[inline] - unsafe fn get_unchecked_mut(self, slice: *mut str) -> *mut Self::Output { - // SAFETY: the caller must uphold the safety contract for `get_unchecked_mut`. - unsafe { (*self.start()..self.end() + 1).get_unchecked_mut(slice) } - } - #[inline] - fn index(self, slice: &str) -> &Self::Output { - if *self.end() == usize::MAX { - str_index_overflow_fail(); - } - (*self.start()..self.end() + 1).index(slice) - } - #[inline] - fn index_mut(self, slice: &mut str) -> &mut Self::Output { - if *self.end() == usize::MAX { - str_index_overflow_fail(); - } - (*self.start()..self.end() + 1).index_mut(slice) - } - } - - /// Implements substring slicing with syntax `&self[..= end]` or `&mut - /// self[..= end]`. - /// - /// Returns a slice of the given string from the byte range [0, `end`]. - /// Equivalent to `&self [0 .. end + 1]`, except if `end` has the maximum - /// value for `usize`. - /// - /// This operation is `O(1)`. - /// - /// # Panics - /// - /// Panics if `end` does not point to the ending byte offset of a character - /// (`end + 1` is either a starting byte offset as defined by - /// `is_char_boundary`, or equal to `len`), or if `end >= len`. 
- #[stable(feature = "inclusive_range", since = "1.26.0")] - unsafe impl SliceIndex for ops::RangeToInclusive { - type Output = str; - #[inline] - fn get(self, slice: &str) -> Option<&Self::Output> { - if self.end == usize::MAX { None } else { (..self.end + 1).get(slice) } - } - #[inline] - fn get_mut(self, slice: &mut str) -> Option<&mut Self::Output> { - if self.end == usize::MAX { None } else { (..self.end + 1).get_mut(slice) } - } - #[inline] - unsafe fn get_unchecked(self, slice: *const str) -> *const Self::Output { - // SAFETY: the caller must uphold the safety contract for `get_unchecked`. - unsafe { (..self.end + 1).get_unchecked(slice) } - } - #[inline] - unsafe fn get_unchecked_mut(self, slice: *mut str) -> *mut Self::Output { - // SAFETY: the caller must uphold the safety contract for `get_unchecked_mut`. - unsafe { (..self.end + 1).get_unchecked_mut(slice) } - } - #[inline] - fn index(self, slice: &str) -> &Self::Output { - if self.end == usize::MAX { - str_index_overflow_fail(); - } - (..self.end + 1).index(slice) - } - #[inline] - fn index_mut(self, slice: &mut str) -> &mut Self::Output { - if self.end == usize::MAX { - str_index_overflow_fail(); - } - (..self.end + 1).index_mut(slice) - } - } -} - // truncate `&str` to length at most equal to `max` // return `true` if it were truncated, and the new str. fn truncate_to_char_boundary(s: &str, mut max: usize) -> (bool, &str) { diff --git a/library/core/src/str/traits.rs b/library/core/src/str/traits.rs new file mode 100644 index 00000000000..4f8aa246e52 --- /dev/null +++ b/library/core/src/str/traits.rs @@ -0,0 +1,597 @@ +//! Trait implementations for `str`. + +use crate::cmp::Ordering; +use crate::ops; +use crate::ptr; +use crate::slice::SliceIndex; + +use super::ParseBoolError; + +/// Implements ordering of strings. +/// +/// Strings are ordered lexicographically by their byte values. This orders Unicode code +/// points based on their positions in the code charts. 
This is not necessarily the same as +/// "alphabetical" order, which varies by language and locale. Sorting strings according to +/// culturally-accepted standards requires locale-specific data that is outside the scope of +/// the `str` type. +#[stable(feature = "rust1", since = "1.0.0")] +impl Ord for str { + #[inline] + fn cmp(&self, other: &str) -> Ordering { + self.as_bytes().cmp(other.as_bytes()) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl PartialEq for str { + #[inline] + fn eq(&self, other: &str) -> bool { + self.as_bytes() == other.as_bytes() + } + #[inline] + fn ne(&self, other: &str) -> bool { + !(*self).eq(other) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl Eq for str {} + +/// Implements comparison operations on strings. +/// +/// Strings are compared lexicographically by their byte values. This compares Unicode code +/// points based on their positions in the code charts. This is not necessarily the same as +/// "alphabetical" order, which varies by language and locale. Comparing strings according to +/// culturally-accepted standards requires locale-specific data that is outside the scope of +/// the `str` type. +#[stable(feature = "rust1", since = "1.0.0")] +impl PartialOrd for str { + #[inline] + fn partial_cmp(&self, other: &str) -> Option { + Some(self.cmp(other)) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl ops::Index for str +where + I: SliceIndex, +{ + type Output = I::Output; + + #[inline] + fn index(&self, index: I) -> &I::Output { + index.index(self) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl ops::IndexMut for str +where + I: SliceIndex, +{ + #[inline] + fn index_mut(&mut self, index: I) -> &mut I::Output { + index.index_mut(self) + } +} + +#[inline(never)] +#[cold] +#[track_caller] +fn str_index_overflow_fail() -> ! { + panic!("attempted to index str up to maximum usize"); +} + +/// Implements substring slicing with syntax `&self[..]` or `&mut self[..]`. 
+/// +/// Returns a slice of the whole string, i.e., returns `&self` or `&mut +/// self`. Equivalent to `&self[0 .. len]` or `&mut self[0 .. len]`. Unlike +/// other indexing operations, this can never panic. +/// +/// This operation is `O(1)`. +/// +/// Prior to 1.20.0, these indexing operations were still supported by +/// direct implementation of `Index` and `IndexMut`. +/// +/// Equivalent to `&self[0 .. len]` or `&mut self[0 .. len]`. +#[stable(feature = "str_checked_slicing", since = "1.20.0")] +unsafe impl SliceIndex for ops::RangeFull { + type Output = str; + #[inline] + fn get(self, slice: &str) -> Option<&Self::Output> { + Some(slice) + } + #[inline] + fn get_mut(self, slice: &mut str) -> Option<&mut Self::Output> { + Some(slice) + } + #[inline] + unsafe fn get_unchecked(self, slice: *const str) -> *const Self::Output { + slice + } + #[inline] + unsafe fn get_unchecked_mut(self, slice: *mut str) -> *mut Self::Output { + slice + } + #[inline] + fn index(self, slice: &str) -> &Self::Output { + slice + } + #[inline] + fn index_mut(self, slice: &mut str) -> &mut Self::Output { + slice + } +} + +/// Implements substring slicing with syntax `&self[begin .. end]` or `&mut +/// self[begin .. end]`. +/// +/// Returns a slice of the given string from the byte range +/// [`begin`, `end`). +/// +/// This operation is `O(1)`. +/// +/// Prior to 1.20.0, these indexing operations were still supported by +/// direct implementation of `Index` and `IndexMut`. +/// +/// # Panics +/// +/// Panics if `begin` or `end` does not point to the starting byte offset of +/// a character (as defined by `is_char_boundary`), if `begin > end`, or if +/// `end > len`. +/// +/// # Examples +/// +/// ``` +/// let s = "Löwe 老虎 Léopard"; +/// assert_eq!(&s[0 .. 1], "L"); +/// +/// assert_eq!(&s[1 .. 9], "öwe 老"); +/// +/// // these will panic: +/// // byte 2 lies within `ö`: +/// // &s[2 ..3]; +/// +/// // byte 8 lies within `老` +/// // &s[1 .. 
8]; +/// +/// // byte 100 is outside the string +/// // &s[3 .. 100]; +/// ``` +#[stable(feature = "str_checked_slicing", since = "1.20.0")] +unsafe impl SliceIndex for ops::Range { + type Output = str; + #[inline] + fn get(self, slice: &str) -> Option<&Self::Output> { + if self.start <= self.end + && slice.is_char_boundary(self.start) + && slice.is_char_boundary(self.end) + { + // SAFETY: just checked that `start` and `end` are on a char boundary, + // and we are passing in a safe reference, so the return value will also be one. + // We also checked char boundaries, so this is valid UTF-8. + Some(unsafe { &*self.get_unchecked(slice) }) + } else { + None + } + } + #[inline] + fn get_mut(self, slice: &mut str) -> Option<&mut Self::Output> { + if self.start <= self.end + && slice.is_char_boundary(self.start) + && slice.is_char_boundary(self.end) + { + // SAFETY: just checked that `start` and `end` are on a char boundary. + // We know the pointer is unique because we got it from `slice`. + Some(unsafe { &mut *self.get_unchecked_mut(slice) }) + } else { + None + } + } + #[inline] + unsafe fn get_unchecked(self, slice: *const str) -> *const Self::Output { + let slice = slice as *const [u8]; + // SAFETY: the caller guarantees that `self` is in bounds of `slice` + // which satisfies all the conditions for `add`. + let ptr = unsafe { slice.as_ptr().add(self.start) }; + let len = self.end - self.start; + ptr::slice_from_raw_parts(ptr, len) as *const str + } + #[inline] + unsafe fn get_unchecked_mut(self, slice: *mut str) -> *mut Self::Output { + let slice = slice as *mut [u8]; + // SAFETY: see comments for `get_unchecked`. 
+ let ptr = unsafe { slice.as_mut_ptr().add(self.start) }; + let len = self.end - self.start; + ptr::slice_from_raw_parts_mut(ptr, len) as *mut str + } + #[inline] + fn index(self, slice: &str) -> &Self::Output { + let (start, end) = (self.start, self.end); + match self.get(slice) { + Some(s) => s, + None => super::slice_error_fail(slice, start, end), + } + } + #[inline] + fn index_mut(self, slice: &mut str) -> &mut Self::Output { + // is_char_boundary checks that the index is in [0, .len()] + // cannot reuse `get` as above, because of NLL trouble + if self.start <= self.end + && slice.is_char_boundary(self.start) + && slice.is_char_boundary(self.end) + { + // SAFETY: just checked that `start` and `end` are on a char boundary, + // and we are passing in a safe reference, so the return value will also be one. + unsafe { &mut *self.get_unchecked_mut(slice) } + } else { + super::slice_error_fail(slice, self.start, self.end) + } + } +} + +/// Implements substring slicing with syntax `&self[.. end]` or `&mut +/// self[.. end]`. +/// +/// Returns a slice of the given string from the byte range [`0`, `end`). +/// Equivalent to `&self[0 .. end]` or `&mut self[0 .. end]`. +/// +/// This operation is `O(1)`. +/// +/// Prior to 1.20.0, these indexing operations were still supported by +/// direct implementation of `Index` and `IndexMut`. +/// +/// # Panics +/// +/// Panics if `end` does not point to the starting byte offset of a +/// character (as defined by `is_char_boundary`), or if `end > len`. +#[stable(feature = "str_checked_slicing", since = "1.20.0")] +unsafe impl SliceIndex for ops::RangeTo { + type Output = str; + #[inline] + fn get(self, slice: &str) -> Option<&Self::Output> { + if slice.is_char_boundary(self.end) { + // SAFETY: just checked that `end` is on a char boundary, + // and we are passing in a safe reference, so the return value will also be one. 
+ Some(unsafe { &*self.get_unchecked(slice) }) + } else { + None + } + } + #[inline] + fn get_mut(self, slice: &mut str) -> Option<&mut Self::Output> { + if slice.is_char_boundary(self.end) { + // SAFETY: just checked that `end` is on a char boundary, + // and we are passing in a safe reference, so the return value will also be one. + Some(unsafe { &mut *self.get_unchecked_mut(slice) }) + } else { + None + } + } + #[inline] + unsafe fn get_unchecked(self, slice: *const str) -> *const Self::Output { + let slice = slice as *const [u8]; + let ptr = slice.as_ptr(); + ptr::slice_from_raw_parts(ptr, self.end) as *const str + } + #[inline] + unsafe fn get_unchecked_mut(self, slice: *mut str) -> *mut Self::Output { + let slice = slice as *mut [u8]; + let ptr = slice.as_mut_ptr(); + ptr::slice_from_raw_parts_mut(ptr, self.end) as *mut str + } + #[inline] + fn index(self, slice: &str) -> &Self::Output { + let end = self.end; + match self.get(slice) { + Some(s) => s, + None => super::slice_error_fail(slice, 0, end), + } + } + #[inline] + fn index_mut(self, slice: &mut str) -> &mut Self::Output { + if slice.is_char_boundary(self.end) { + // SAFETY: just checked that `end` is on a char boundary, + // and we are passing in a safe reference, so the return value will also be one. + unsafe { &mut *self.get_unchecked_mut(slice) } + } else { + super::slice_error_fail(slice, 0, self.end) + } + } +} + +/// Implements substring slicing with syntax `&self[begin ..]` or `&mut +/// self[begin ..]`. +/// +/// Returns a slice of the given string from the byte range [`begin`, +/// `len`). Equivalent to `&self[begin .. len]` or `&mut self[begin .. +/// len]`. +/// +/// This operation is `O(1)`. +/// +/// Prior to 1.20.0, these indexing operations were still supported by +/// direct implementation of `Index` and `IndexMut`. +/// +/// # Panics +/// +/// Panics if `begin` does not point to the starting byte offset of +/// a character (as defined by `is_char_boundary`), or if `begin > len`. 
+#[stable(feature = "str_checked_slicing", since = "1.20.0")] +unsafe impl SliceIndex for ops::RangeFrom { + type Output = str; + #[inline] + fn get(self, slice: &str) -> Option<&Self::Output> { + if slice.is_char_boundary(self.start) { + // SAFETY: just checked that `start` is on a char boundary, + // and we are passing in a safe reference, so the return value will also be one. + Some(unsafe { &*self.get_unchecked(slice) }) + } else { + None + } + } + #[inline] + fn get_mut(self, slice: &mut str) -> Option<&mut Self::Output> { + if slice.is_char_boundary(self.start) { + // SAFETY: just checked that `start` is on a char boundary, + // and we are passing in a safe reference, so the return value will also be one. + Some(unsafe { &mut *self.get_unchecked_mut(slice) }) + } else { + None + } + } + #[inline] + unsafe fn get_unchecked(self, slice: *const str) -> *const Self::Output { + let slice = slice as *const [u8]; + // SAFETY: the caller guarantees that `self` is in bounds of `slice` + // which satisfies all the conditions for `add`. + let ptr = unsafe { slice.as_ptr().add(self.start) }; + let len = slice.len() - self.start; + ptr::slice_from_raw_parts(ptr, len) as *const str + } + #[inline] + unsafe fn get_unchecked_mut(self, slice: *mut str) -> *mut Self::Output { + let slice = slice as *mut [u8]; + // SAFETY: identical to `get_unchecked`. + let ptr = unsafe { slice.as_mut_ptr().add(self.start) }; + let len = slice.len() - self.start; + ptr::slice_from_raw_parts_mut(ptr, len) as *mut str + } + #[inline] + fn index(self, slice: &str) -> &Self::Output { + let (start, end) = (self.start, slice.len()); + match self.get(slice) { + Some(s) => s, + None => super::slice_error_fail(slice, start, end), + } + } + #[inline] + fn index_mut(self, slice: &mut str) -> &mut Self::Output { + if slice.is_char_boundary(self.start) { + // SAFETY: just checked that `start` is on a char boundary, + // and we are passing in a safe reference, so the return value will also be one. 
+ unsafe { &mut *self.get_unchecked_mut(slice) } + } else { + super::slice_error_fail(slice, self.start, slice.len()) + } + } +} + +/// Implements substring slicing with syntax `&self[begin ..= end]` or `&mut +/// self[begin ..= end]`. +/// +/// Returns a slice of the given string from the byte range +/// [`begin`, `end`]. Equivalent to `&self [begin .. end + 1]` or `&mut +/// self[begin .. end + 1]`, except if `end` has the maximum value for +/// `usize`. +/// +/// This operation is `O(1)`. +/// +/// # Panics +/// +/// Panics if `begin` does not point to the starting byte offset of +/// a character (as defined by `is_char_boundary`), if `end` does not point +/// to the ending byte offset of a character (`end + 1` is either a starting +/// byte offset or equal to `len`), if `begin > end`, or if `end >= len`. +#[stable(feature = "inclusive_range", since = "1.26.0")] +unsafe impl SliceIndex for ops::RangeInclusive { + type Output = str; + #[inline] + fn get(self, slice: &str) -> Option<&Self::Output> { + if *self.end() == usize::MAX { None } else { (*self.start()..self.end() + 1).get(slice) } + } + #[inline] + fn get_mut(self, slice: &mut str) -> Option<&mut Self::Output> { + if *self.end() == usize::MAX { + None + } else { + (*self.start()..self.end() + 1).get_mut(slice) + } + } + #[inline] + unsafe fn get_unchecked(self, slice: *const str) -> *const Self::Output { + // SAFETY: the caller must uphold the safety contract for `get_unchecked`. + unsafe { (*self.start()..self.end() + 1).get_unchecked(slice) } + } + #[inline] + unsafe fn get_unchecked_mut(self, slice: *mut str) -> *mut Self::Output { + // SAFETY: the caller must uphold the safety contract for `get_unchecked_mut`. 
+ unsafe { (*self.start()..self.end() + 1).get_unchecked_mut(slice) } + } + #[inline] + fn index(self, slice: &str) -> &Self::Output { + if *self.end() == usize::MAX { + str_index_overflow_fail(); + } + (*self.start()..self.end() + 1).index(slice) + } + #[inline] + fn index_mut(self, slice: &mut str) -> &mut Self::Output { + if *self.end() == usize::MAX { + str_index_overflow_fail(); + } + (*self.start()..self.end() + 1).index_mut(slice) + } +} + +/// Implements substring slicing with syntax `&self[..= end]` or `&mut +/// self[..= end]`. +/// +/// Returns a slice of the given string from the byte range [0, `end`]. +/// Equivalent to `&self [0 .. end + 1]`, except if `end` has the maximum +/// value for `usize`. +/// +/// This operation is `O(1)`. +/// +/// # Panics +/// +/// Panics if `end` does not point to the ending byte offset of a character +/// (`end + 1` is either a starting byte offset as defined by +/// `is_char_boundary`, or equal to `len`), or if `end >= len`. +#[stable(feature = "inclusive_range", since = "1.26.0")] +unsafe impl SliceIndex for ops::RangeToInclusive { + type Output = str; + #[inline] + fn get(self, slice: &str) -> Option<&Self::Output> { + if self.end == usize::MAX { None } else { (..self.end + 1).get(slice) } + } + #[inline] + fn get_mut(self, slice: &mut str) -> Option<&mut Self::Output> { + if self.end == usize::MAX { None } else { (..self.end + 1).get_mut(slice) } + } + #[inline] + unsafe fn get_unchecked(self, slice: *const str) -> *const Self::Output { + // SAFETY: the caller must uphold the safety contract for `get_unchecked`. + unsafe { (..self.end + 1).get_unchecked(slice) } + } + #[inline] + unsafe fn get_unchecked_mut(self, slice: *mut str) -> *mut Self::Output { + // SAFETY: the caller must uphold the safety contract for `get_unchecked_mut`. 
+ unsafe { (..self.end + 1).get_unchecked_mut(slice) } + } + #[inline] + fn index(self, slice: &str) -> &Self::Output { + if self.end == usize::MAX { + str_index_overflow_fail(); + } + (..self.end + 1).index(slice) + } + #[inline] + fn index_mut(self, slice: &mut str) -> &mut Self::Output { + if self.end == usize::MAX { + str_index_overflow_fail(); + } + (..self.end + 1).index_mut(slice) + } +} + +/// Parse a value from a string +/// +/// `FromStr`'s [`from_str`] method is often used implicitly, through +/// [`str`]'s [`parse`] method. See [`parse`]'s documentation for examples. +/// +/// [`from_str`]: FromStr::from_str +/// [`parse`]: str::parse +/// +/// `FromStr` does not have a lifetime parameter, and so you can only parse types +/// that do not contain a lifetime parameter themselves. In other words, you can +/// parse an `i32` with `FromStr`, but not a `&i32`. You can parse a struct that +/// contains an `i32`, but not one that contains an `&i32`. +/// +/// # Examples +/// +/// Basic implementation of `FromStr` on an example `Point` type: +/// +/// ``` +/// use std::str::FromStr; +/// use std::num::ParseIntError; +/// +/// #[derive(Debug, PartialEq)] +/// struct Point { +/// x: i32, +/// y: i32 +/// } +/// +/// impl FromStr for Point { +/// type Err = ParseIntError; +/// +/// fn from_str(s: &str) -> Result { +/// let coords: Vec<&str> = s.trim_matches(|p| p == '(' || p == ')' ) +/// .split(',') +/// .collect(); +/// +/// let x_fromstr = coords[0].parse::()?; +/// let y_fromstr = coords[1].parse::()?; +/// +/// Ok(Point { x: x_fromstr, y: y_fromstr }) +/// } +/// } +/// +/// let p = Point::from_str("(1,2)"); +/// assert_eq!(p.unwrap(), Point{ x: 1, y: 2} ) +/// ``` +#[stable(feature = "rust1", since = "1.0.0")] +pub trait FromStr: Sized { + /// The associated error which can be returned from parsing. + #[stable(feature = "rust1", since = "1.0.0")] + type Err; + + /// Parses a string `s` to return a value of this type. 
+ /// + /// If parsing succeeds, return the value inside [`Ok`], otherwise + /// when the string is ill-formatted return an error inside [`Err`]. + /// The error type is specific to the implementation of the trait. + /// + /// # Examples + /// + /// Basic usage with [`i32`][ithirtytwo], a type that implements `FromStr`: + /// + /// [ithirtytwo]: ../../std/primitive.i32.html + /// + /// ``` + /// use std::str::FromStr; + /// + /// let s = "5"; + /// let x = i32::from_str(s).unwrap(); + /// + /// assert_eq!(5, x); + /// ``` + #[stable(feature = "rust1", since = "1.0.0")] + fn from_str(s: &str) -> Result; +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl FromStr for bool { + type Err = ParseBoolError; + + /// Parse a `bool` from a string. + /// + /// Yields a `Result<bool, ParseBoolError>`, because `s` may or may not + /// actually be parseable. + /// + /// # Examples + /// + /// ``` + /// use std::str::FromStr; + /// + /// assert_eq!(FromStr::from_str("true"), Ok(true)); + /// assert_eq!(FromStr::from_str("false"), Ok(false)); + /// assert!(<bool as FromStr>::from_str("not even a boolean").is_err()); + /// ``` + /// + /// Note, in many cases, the `.parse()` method on `str` is more proper. 
+ /// + /// ``` + /// assert_eq!("true".parse(), Ok(true)); + /// assert_eq!("false".parse(), Ok(false)); + /// assert!("not even a boolean".parse::<bool>().is_err()); + /// ``` + #[inline] + fn from_str(s: &str) -> Result { + match s { + "true" => Ok(true), + "false" => Ok(false), + _ => Err(ParseBoolError { _priv: () }), + } + } +} From 5f0d724e298ee4e19bef4e863c176dd87ef6400f Mon Sep 17 00:00:00 2001 From: Lzu Tao Date: Fri, 4 Sep 2020 08:46:47 +0000 Subject: [PATCH 3/7] Move str's impl of iterations to new mod --- library/core/src/str/iter.rs | 1255 +++++++++++++++++++++++++++++++++ library/core/src/str/mod.rs | 1283 +--------------------------------- 2 files changed, 1292 insertions(+), 1246 deletions(-) create mode 100644 library/core/src/str/iter.rs diff --git a/library/core/src/str/iter.rs b/library/core/src/str/iter.rs new file mode 100644 index 00000000000..993df96a2d1 --- /dev/null +++ b/library/core/src/str/iter.rs @@ -0,0 +1,1255 @@ +//! Iterators for `str` methods. + +use crate::char; +use crate::fmt::{self, Write}; +use crate::iter::TrustedRandomAccess; +use crate::iter::{Chain, FlatMap, Flatten}; +use crate::iter::{Copied, Filter, FusedIterator, Map, TrustedLen}; +use crate::ops::Try; +use crate::option; +use crate::slice::{self, Split as SliceSplit}; + +use super::from_utf8_unchecked; +use super::pattern::Pattern; +use super::pattern::{DoubleEndedSearcher, ReverseSearcher, Searcher}; +use super::LinesAnyMap; +use super::{next_code_point, next_code_point_reverse, utf8_is_cont_byte}; +use super::{BytesIsNotEmpty, UnsafeBytesToStr}; +use super::{CharEscapeDebugContinue, CharEscapeDefault, CharEscapeUnicode}; +use super::{IsAsciiWhitespace, IsNotEmpty, IsWhitespace}; + +/// An iterator over the [`char`]s of a string slice. +/// +/// +/// This struct is created by the [`chars`] method on [`str`]. +/// See its documentation for more. 
+/// +/// [`char`]: prim@char +/// [`chars`]: str::chars +#[derive(Clone)] +#[stable(feature = "rust1", since = "1.0.0")] +pub struct Chars<'a> { + pub(super) iter: slice::Iter<'a, u8>, +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl<'a> Iterator for Chars<'a> { + type Item = char; + + #[inline] + fn next(&mut self) -> Option { + next_code_point(&mut self.iter).map(|ch| { + // SAFETY: `str` invariant says `ch` is a valid Unicode Scalar Value. + unsafe { char::from_u32_unchecked(ch) } + }) + } + + #[inline] + fn count(self) -> usize { + // length in `char` is equal to the number of non-continuation bytes + let bytes_len = self.iter.len(); + let mut cont_bytes = 0; + for &byte in self.iter { + cont_bytes += utf8_is_cont_byte(byte) as usize; + } + bytes_len - cont_bytes + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + let len = self.iter.len(); + // `(len + 3)` can't overflow, because we know that the `slice::Iter` + // belongs to a slice in memory which has a maximum length of + // `isize::MAX` (that's well below `usize::MAX`). + ((len + 3) / 4, Some(len)) + } + + #[inline] + fn last(mut self) -> Option { + // No need to go through the entire string. + self.next_back() + } +} + +#[stable(feature = "chars_debug_impl", since = "1.38.0")] +impl fmt::Debug for Chars<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "Chars(")?; + f.debug_list().entries(self.clone()).finish()?; + write!(f, ")")?; + Ok(()) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl<'a> DoubleEndedIterator for Chars<'a> { + #[inline] + fn next_back(&mut self) -> Option { + next_code_point_reverse(&mut self.iter).map(|ch| { + // SAFETY: `str` invariant says `ch` is a valid Unicode Scalar Value. + unsafe { char::from_u32_unchecked(ch) } + }) + } +} + +#[stable(feature = "fused", since = "1.26.0")] +impl FusedIterator for Chars<'_> {} + +impl<'a> Chars<'a> { + /// Views the underlying data as a subslice of the original data. 
+ /// + /// This has the same lifetime as the original slice, and so the + /// iterator can continue to be used while this exists. + /// + /// # Examples + /// + /// ``` + /// let mut chars = "abc".chars(); + /// + /// assert_eq!(chars.as_str(), "abc"); + /// chars.next(); + /// assert_eq!(chars.as_str(), "bc"); + /// chars.next(); + /// chars.next(); + /// assert_eq!(chars.as_str(), ""); + /// ``` + #[stable(feature = "iter_to_slice", since = "1.4.0")] + #[inline] + pub fn as_str(&self) -> &'a str { + // SAFETY: `Chars` is only made from a str, which guarantees the iter is valid UTF-8. + unsafe { from_utf8_unchecked(self.iter.as_slice()) } + } +} + +/// An iterator over the [`char`]s of a string slice, and their positions. +/// +/// This struct is created by the [`char_indices`] method on [`str`]. +/// See its documentation for more. +/// +/// [`char`]: prim@char +/// [`char_indices`]: str::char_indices +#[derive(Clone, Debug)] +#[stable(feature = "rust1", since = "1.0.0")] +pub struct CharIndices<'a> { + pub(super) front_offset: usize, + pub(super) iter: Chars<'a>, +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl<'a> Iterator for CharIndices<'a> { + type Item = (usize, char); + + #[inline] + fn next(&mut self) -> Option<(usize, char)> { + let pre_len = self.iter.iter.len(); + match self.iter.next() { + None => None, + Some(ch) => { + let index = self.front_offset; + let len = self.iter.iter.len(); + self.front_offset += pre_len - len; + Some((index, ch)) + } + } + } + + #[inline] + fn count(self) -> usize { + self.iter.count() + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + self.iter.size_hint() + } + + #[inline] + fn last(mut self) -> Option<(usize, char)> { + // No need to go through the entire string. 
+ self.next_back() + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl<'a> DoubleEndedIterator for CharIndices<'a> { + #[inline] + fn next_back(&mut self) -> Option<(usize, char)> { + self.iter.next_back().map(|ch| { + let index = self.front_offset + self.iter.iter.len(); + (index, ch) + }) + } +} + +#[stable(feature = "fused", since = "1.26.0")] +impl FusedIterator for CharIndices<'_> {} + +impl<'a> CharIndices<'a> { + /// Views the underlying data as a subslice of the original data. + /// + /// This has the same lifetime as the original slice, and so the + /// iterator can continue to be used while this exists. + #[stable(feature = "iter_to_slice", since = "1.4.0")] + #[inline] + pub fn as_str(&self) -> &'a str { + self.iter.as_str() + } +} + +/// An iterator over the bytes of a string slice. +/// +/// This struct is created by the [`bytes`] method on [`str`]. +/// See its documentation for more. +/// +/// [`bytes`]: str::bytes +#[stable(feature = "rust1", since = "1.0.0")] +#[derive(Clone, Debug)] +pub struct Bytes<'a>(pub(super) Copied>); + +#[stable(feature = "rust1", since = "1.0.0")] +impl Iterator for Bytes<'_> { + type Item = u8; + + #[inline] + fn next(&mut self) -> Option { + self.0.next() + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + self.0.size_hint() + } + + #[inline] + fn count(self) -> usize { + self.0.count() + } + + #[inline] + fn last(self) -> Option { + self.0.last() + } + + #[inline] + fn nth(&mut self, n: usize) -> Option { + self.0.nth(n) + } + + #[inline] + fn all(&mut self, f: F) -> bool + where + F: FnMut(Self::Item) -> bool, + { + self.0.all(f) + } + + #[inline] + fn any(&mut self, f: F) -> bool + where + F: FnMut(Self::Item) -> bool, + { + self.0.any(f) + } + + #[inline] + fn find

(&mut self, predicate: P) -> Option + where + P: FnMut(&Self::Item) -> bool, + { + self.0.find(predicate) + } + + #[inline] + fn position

(&mut self, predicate: P) -> Option + where + P: FnMut(Self::Item) -> bool, + { + self.0.position(predicate) + } + + #[inline] + fn rposition

(&mut self, predicate: P) -> Option + where + P: FnMut(Self::Item) -> bool, + { + self.0.rposition(predicate) + } + + #[inline] + unsafe fn __iterator_get_unchecked(&mut self, idx: usize) -> u8 { + // SAFETY: the caller must uphold the safety contract + // for `Iterator::__iterator_get_unchecked`. + unsafe { self.0.__iterator_get_unchecked(idx) } + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl DoubleEndedIterator for Bytes<'_> { + #[inline] + fn next_back(&mut self) -> Option { + self.0.next_back() + } + + #[inline] + fn nth_back(&mut self, n: usize) -> Option { + self.0.nth_back(n) + } + + #[inline] + fn rfind

(&mut self, predicate: P) -> Option + where + P: FnMut(&Self::Item) -> bool, + { + self.0.rfind(predicate) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl ExactSizeIterator for Bytes<'_> { + #[inline] + fn len(&self) -> usize { + self.0.len() + } + + #[inline] + fn is_empty(&self) -> bool { + self.0.is_empty() + } +} + +#[stable(feature = "fused", since = "1.26.0")] +impl FusedIterator for Bytes<'_> {} + +#[unstable(feature = "trusted_len", issue = "37572")] +unsafe impl TrustedLen for Bytes<'_> {} + +#[doc(hidden)] +#[unstable(feature = "trusted_random_access", issue = "none")] +unsafe impl TrustedRandomAccess for Bytes<'_> { + fn may_have_side_effect() -> bool { + false + } +} + +/// This macro generates a Clone impl for string pattern API +/// wrapper types of the form X<'a, P> +macro_rules! derive_pattern_clone { + (clone $t:ident with |$s:ident| $e:expr) => { + impl<'a, P> Clone for $t<'a, P> + where + P: Pattern<'a, Searcher: Clone>, + { + fn clone(&self) -> Self { + let $s = self; + $e + } + } + }; +} + +/// This macro generates two public iterator structs +/// wrapping a private internal one that makes use of the `Pattern` API. 
+/// +/// For all patterns `P: Pattern<'a>` the following items will be +/// generated (generics omitted): +/// +/// struct $forward_iterator($internal_iterator); +/// struct $reverse_iterator($internal_iterator); +/// +/// impl Iterator for $forward_iterator +/// { /* internal ends up calling Searcher::next_match() */ } +/// +/// impl DoubleEndedIterator for $forward_iterator +/// where P::Searcher: DoubleEndedSearcher +/// { /* internal ends up calling Searcher::next_match_back() */ } +/// +/// impl Iterator for $reverse_iterator +/// where P::Searcher: ReverseSearcher +/// { /* internal ends up calling Searcher::next_match_back() */ } +/// +/// impl DoubleEndedIterator for $reverse_iterator +/// where P::Searcher: DoubleEndedSearcher +/// { /* internal ends up calling Searcher::next_match() */ } +/// +/// The internal one is defined outside the macro, and has almost the same +/// semantics as a `DoubleEndedIterator` by delegating to `pattern::Searcher` and +/// `pattern::ReverseSearcher` for both forward and reverse iteration. +/// +/// "Almost", because a `Searcher` and a `ReverseSearcher` for a given +/// `Pattern` might not return the same elements, so actually implementing +/// `DoubleEndedIterator` for it would be incorrect. +/// (See the docs in `str::pattern` for more details) +/// +/// However, the internal struct still represents a single ended iterator from +/// either end, and depending on the pattern is also a valid double ended iterator, +/// so the two wrapper structs implement `Iterator` +/// and `DoubleEndedIterator` depending on the concrete pattern type, leading +/// to the complex impls seen above. +macro_rules! 
generate_pattern_iterators { + { + // Forward iterator + forward: + $(#[$forward_iterator_attribute:meta])* + struct $forward_iterator:ident; + + // Reverse iterator + reverse: + $(#[$reverse_iterator_attribute:meta])* + struct $reverse_iterator:ident; + + // Stability of all generated items + stability: + $(#[$common_stability_attribute:meta])* + + // Internal almost-iterator that is being delegated to + internal: + $internal_iterator:ident yielding ($iterty:ty); + + // Kind of delegation - either single ended or double ended + delegate $($t:tt)* + } => { + $(#[$forward_iterator_attribute])* + $(#[$common_stability_attribute])* + pub struct $forward_iterator<'a, P: Pattern<'a>>(pub(super) $internal_iterator<'a, P>); + + $(#[$common_stability_attribute])* + impl<'a, P> fmt::Debug for $forward_iterator<'a, P> + where + P: Pattern<'a, Searcher: fmt::Debug>, + { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_tuple(stringify!($forward_iterator)) + .field(&self.0) + .finish() + } + } + + $(#[$common_stability_attribute])* + impl<'a, P: Pattern<'a>> Iterator for $forward_iterator<'a, P> { + type Item = $iterty; + + #[inline] + fn next(&mut self) -> Option<$iterty> { + self.0.next() + } + } + + $(#[$common_stability_attribute])* + impl<'a, P> Clone for $forward_iterator<'a, P> + where + P: Pattern<'a, Searcher: Clone>, + { + fn clone(&self) -> Self { + $forward_iterator(self.0.clone()) + } + } + + $(#[$reverse_iterator_attribute])* + $(#[$common_stability_attribute])* + pub struct $reverse_iterator<'a, P: Pattern<'a>>(pub(super) $internal_iterator<'a, P>); + + $(#[$common_stability_attribute])* + impl<'a, P> fmt::Debug for $reverse_iterator<'a, P> + where + P: Pattern<'a, Searcher: fmt::Debug>, + { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_tuple(stringify!($reverse_iterator)) + .field(&self.0) + .finish() + } + } + + $(#[$common_stability_attribute])* + impl<'a, P> Iterator for $reverse_iterator<'a, P> + where + P: 
Pattern<'a, Searcher: ReverseSearcher<'a>>, + { + type Item = $iterty; + + #[inline] + fn next(&mut self) -> Option<$iterty> { + self.0.next_back() + } + } + + $(#[$common_stability_attribute])* + impl<'a, P> Clone for $reverse_iterator<'a, P> + where + P: Pattern<'a, Searcher: Clone>, + { + fn clone(&self) -> Self { + $reverse_iterator(self.0.clone()) + } + } + + #[stable(feature = "fused", since = "1.26.0")] + impl<'a, P: Pattern<'a>> FusedIterator for $forward_iterator<'a, P> {} + + #[stable(feature = "fused", since = "1.26.0")] + impl<'a, P> FusedIterator for $reverse_iterator<'a, P> + where + P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + {} + + generate_pattern_iterators!($($t)* with $(#[$common_stability_attribute])*, + $forward_iterator, + $reverse_iterator, $iterty); + }; + { + double ended; with $(#[$common_stability_attribute:meta])*, + $forward_iterator:ident, + $reverse_iterator:ident, $iterty:ty + } => { + $(#[$common_stability_attribute])* + impl<'a, P> DoubleEndedIterator for $forward_iterator<'a, P> + where + P: Pattern<'a, Searcher: DoubleEndedSearcher<'a>>, + { + #[inline] + fn next_back(&mut self) -> Option<$iterty> { + self.0.next_back() + } + } + + $(#[$common_stability_attribute])* + impl<'a, P> DoubleEndedIterator for $reverse_iterator<'a, P> + where + P: Pattern<'a, Searcher: DoubleEndedSearcher<'a>>, + { + #[inline] + fn next_back(&mut self) -> Option<$iterty> { + self.0.next() + } + } + }; + { + single ended; with $(#[$common_stability_attribute:meta])*, + $forward_iterator:ident, + $reverse_iterator:ident, $iterty:ty + } => {} +} + +derive_pattern_clone! 
{ + clone SplitInternal + with |s| SplitInternal { matcher: s.matcher.clone(), ..*s } +} + +pub(super) struct SplitInternal<'a, P: Pattern<'a>> { + pub(super) start: usize, + pub(super) end: usize, + pub(super) matcher: P::Searcher, + pub(super) allow_trailing_empty: bool, + pub(super) finished: bool, +} + +impl<'a, P> fmt::Debug for SplitInternal<'a, P> +where + P: Pattern<'a, Searcher: fmt::Debug>, +{ + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("SplitInternal") + .field("start", &self.start) + .field("end", &self.end) + .field("matcher", &self.matcher) + .field("allow_trailing_empty", &self.allow_trailing_empty) + .field("finished", &self.finished) + .finish() + } +} + +impl<'a, P: Pattern<'a>> SplitInternal<'a, P> { + #[inline] + fn get_end(&mut self) -> Option<&'a str> { + if !self.finished && (self.allow_trailing_empty || self.end - self.start > 0) { + self.finished = true; + // SAFETY: `self.start` and `self.end` always lie on unicode boundaries. + unsafe { + let string = self.matcher.haystack().get_unchecked(self.start..self.end); + Some(string) + } + } else { + None + } + } + + #[inline] + fn next(&mut self) -> Option<&'a str> { + if self.finished { + return None; + } + + let haystack = self.matcher.haystack(); + match self.matcher.next_match() { + // SAFETY: `Searcher` guarantees that `a` and `b` lie on unicode boundaries. + Some((a, b)) => unsafe { + let elt = haystack.get_unchecked(self.start..a); + self.start = b; + Some(elt) + }, + None => self.get_end(), + } + } + + #[inline] + fn next_inclusive(&mut self) -> Option<&'a str> { + if self.finished { + return None; + } + + let haystack = self.matcher.haystack(); + match self.matcher.next_match() { + // SAFETY: `Searcher` guarantees that `b` lies on unicode boundary, + // and self.start is either the start of the original string, + // or `b` was assigned to it, so it also lies on unicode boundary. 
+ Some((_, b)) => unsafe { + let elt = haystack.get_unchecked(self.start..b); + self.start = b; + Some(elt) + }, + None => self.get_end(), + } + } + + #[inline] + fn next_back(&mut self) -> Option<&'a str> + where + P::Searcher: ReverseSearcher<'a>, + { + if self.finished { + return None; + } + + if !self.allow_trailing_empty { + self.allow_trailing_empty = true; + match self.next_back() { + Some(elt) if !elt.is_empty() => return Some(elt), + _ => { + if self.finished { + return None; + } + } + } + } + + let haystack = self.matcher.haystack(); + match self.matcher.next_match_back() { + // SAFETY: `Searcher` guarantees that `a` and `b` lie on unicode boundaries. + Some((a, b)) => unsafe { + let elt = haystack.get_unchecked(b..self.end); + self.end = a; + Some(elt) + }, + // SAFETY: `self.start` and `self.end` always lie on unicode boundaries. + None => unsafe { + self.finished = true; + Some(haystack.get_unchecked(self.start..self.end)) + }, + } + } + + #[inline] + fn next_back_inclusive(&mut self) -> Option<&'a str> + where + P::Searcher: ReverseSearcher<'a>, + { + if self.finished { + return None; + } + + if !self.allow_trailing_empty { + self.allow_trailing_empty = true; + match self.next_back_inclusive() { + Some(elt) if !elt.is_empty() => return Some(elt), + _ => { + if self.finished { + return None; + } + } + } + } + + let haystack = self.matcher.haystack(); + match self.matcher.next_match_back() { + // SAFETY: `Searcher` guarantees that `b` lies on unicode boundary, + // and self.end is either the end of the original string, + // or `b` was assigned to it, so it also lies on unicode boundary. + Some((_, b)) => unsafe { + let elt = haystack.get_unchecked(b..self.end); + self.end = b; + Some(elt) + }, + // SAFETY: self.start is either the start of the original string, + // or start of a substring that represents the part of the string that hasn't + // iterated yet. Either way, it is guaranteed to lie on unicode boundary. 
+ // self.end is either the end of the original string, + // or `b` was assigned to it, so it also lies on unicode boundary. + None => unsafe { + self.finished = true; + Some(haystack.get_unchecked(self.start..self.end)) + }, + } + } +} + +generate_pattern_iterators! { + forward: + /// Created with the method [`split`]. + /// + /// [`split`]: str::split + struct Split; + reverse: + /// Created with the method [`rsplit`]. + /// + /// [`rsplit`]: str::rsplit + struct RSplit; + stability: + #[stable(feature = "rust1", since = "1.0.0")] + internal: + SplitInternal yielding (&'a str); + delegate double ended; +} + +generate_pattern_iterators! { + forward: + /// Created with the method [`split_terminator`]. + /// + /// [`split_terminator`]: str::split_terminator + struct SplitTerminator; + reverse: + /// Created with the method [`rsplit_terminator`]. + /// + /// [`rsplit_terminator`]: str::rsplit_terminator + struct RSplitTerminator; + stability: + #[stable(feature = "rust1", since = "1.0.0")] + internal: + SplitInternal yielding (&'a str); + delegate double ended; +} + +derive_pattern_clone! 
{ + clone SplitNInternal + with |s| SplitNInternal { iter: s.iter.clone(), ..*s } +} + +pub(super) struct SplitNInternal<'a, P: Pattern<'a>> { + pub(super) iter: SplitInternal<'a, P>, + /// The number of splits remaining + pub(super) count: usize, +} + +impl<'a, P> fmt::Debug for SplitNInternal<'a, P> +where + P: Pattern<'a, Searcher: fmt::Debug>, +{ + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("SplitNInternal") + .field("iter", &self.iter) + .field("count", &self.count) + .finish() + } +} + +impl<'a, P: Pattern<'a>> SplitNInternal<'a, P> { + #[inline] + fn next(&mut self) -> Option<&'a str> { + match self.count { + 0 => None, + 1 => { + self.count = 0; + self.iter.get_end() + } + _ => { + self.count -= 1; + self.iter.next() + } + } + } + + #[inline] + fn next_back(&mut self) -> Option<&'a str> + where + P::Searcher: ReverseSearcher<'a>, + { + match self.count { + 0 => None, + 1 => { + self.count = 0; + self.iter.get_end() + } + _ => { + self.count -= 1; + self.iter.next_back() + } + } + } +} + +generate_pattern_iterators! { + forward: + /// Created with the method [`splitn`]. + /// + /// [`splitn`]: str::splitn + struct SplitN; + reverse: + /// Created with the method [`rsplitn`]. + /// + /// [`rsplitn`]: str::rsplitn + struct RSplitN; + stability: + #[stable(feature = "rust1", since = "1.0.0")] + internal: + SplitNInternal yielding (&'a str); + delegate single ended; +} + +derive_pattern_clone! 
{ + clone MatchIndicesInternal + with |s| MatchIndicesInternal(s.0.clone()) +} + +pub(super) struct MatchIndicesInternal<'a, P: Pattern<'a>>(pub(super) P::Searcher); + +impl<'a, P> fmt::Debug for MatchIndicesInternal<'a, P> +where + P: Pattern<'a, Searcher: fmt::Debug>, +{ + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_tuple("MatchIndicesInternal").field(&self.0).finish() + } +} + +impl<'a, P: Pattern<'a>> MatchIndicesInternal<'a, P> { + #[inline] + fn next(&mut self) -> Option<(usize, &'a str)> { + self.0 + .next_match() + // SAFETY: `Searcher` guarantees that `start` and `end` lie on unicode boundaries. + .map(|(start, end)| unsafe { (start, self.0.haystack().get_unchecked(start..end)) }) + } + + #[inline] + fn next_back(&mut self) -> Option<(usize, &'a str)> + where + P::Searcher: ReverseSearcher<'a>, + { + self.0 + .next_match_back() + // SAFETY: `Searcher` guarantees that `start` and `end` lie on unicode boundaries. + .map(|(start, end)| unsafe { (start, self.0.haystack().get_unchecked(start..end)) }) + } +} + +generate_pattern_iterators! { + forward: + /// Created with the method [`match_indices`]. + /// + /// [`match_indices`]: str::match_indices + struct MatchIndices; + reverse: + /// Created with the method [`rmatch_indices`]. + /// + /// [`rmatch_indices`]: str::rmatch_indices + struct RMatchIndices; + stability: + #[stable(feature = "str_match_indices", since = "1.5.0")] + internal: + MatchIndicesInternal yielding ((usize, &'a str)); + delegate double ended; +} + +derive_pattern_clone! 
{ + clone MatchesInternal + with |s| MatchesInternal(s.0.clone()) +} + +pub(super) struct MatchesInternal<'a, P: Pattern<'a>>(pub(super) P::Searcher); + +impl<'a, P> fmt::Debug for MatchesInternal<'a, P> +where + P: Pattern<'a, Searcher: fmt::Debug>, +{ + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_tuple("MatchesInternal").field(&self.0).finish() + } +} + +impl<'a, P: Pattern<'a>> MatchesInternal<'a, P> { + #[inline] + fn next(&mut self) -> Option<&'a str> { + // SAFETY: `Searcher` guarantees that `start` and `end` lie on unicode boundaries. + self.0.next_match().map(|(a, b)| unsafe { + // Indices are known to be on utf8 boundaries + self.0.haystack().get_unchecked(a..b) + }) + } + + #[inline] + fn next_back(&mut self) -> Option<&'a str> + where + P::Searcher: ReverseSearcher<'a>, + { + // SAFETY: `Searcher` guarantees that `start` and `end` lie on unicode boundaries. + self.0.next_match_back().map(|(a, b)| unsafe { + // Indices are known to be on utf8 boundaries + self.0.haystack().get_unchecked(a..b) + }) + } +} + +generate_pattern_iterators! { + forward: + /// Created with the method [`matches`]. + /// + /// [`matches`]: str::matches + struct Matches; + reverse: + /// Created with the method [`rmatches`]. + /// + /// [`rmatches`]: str::rmatches + struct RMatches; + stability: + #[stable(feature = "str_matches", since = "1.2.0")] + internal: + MatchesInternal yielding (&'a str); + delegate double ended; +} + +/// An iterator over the lines of a string, as string slices. +/// +/// This struct is created with the [`lines`] method on [`str`]. +/// See its documentation for more. 
+/// +/// [`lines`]: str::lines +#[stable(feature = "rust1", since = "1.0.0")] +#[derive(Clone, Debug)] +pub struct Lines<'a>(pub(super) Map, LinesAnyMap>); + +#[stable(feature = "rust1", since = "1.0.0")] +impl<'a> Iterator for Lines<'a> { + type Item = &'a str; + + #[inline] + fn next(&mut self) -> Option<&'a str> { + self.0.next() + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + self.0.size_hint() + } + + #[inline] + fn last(mut self) -> Option<&'a str> { + self.next_back() + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl<'a> DoubleEndedIterator for Lines<'a> { + #[inline] + fn next_back(&mut self) -> Option<&'a str> { + self.0.next_back() + } +} + +#[stable(feature = "fused", since = "1.26.0")] +impl FusedIterator for Lines<'_> {} + +/// Created with the method [`lines_any`]. +/// +/// [`lines_any`]: str::lines_any +#[stable(feature = "rust1", since = "1.0.0")] +#[rustc_deprecated(since = "1.4.0", reason = "use lines()/Lines instead now")] +#[derive(Clone, Debug)] +#[allow(deprecated)] +pub struct LinesAny<'a>(pub(super) Lines<'a>); + +#[stable(feature = "rust1", since = "1.0.0")] +#[allow(deprecated)] +impl<'a> Iterator for LinesAny<'a> { + type Item = &'a str; + + #[inline] + fn next(&mut self) -> Option<&'a str> { + self.0.next() + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + self.0.size_hint() + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +#[allow(deprecated)] +impl<'a> DoubleEndedIterator for LinesAny<'a> { + #[inline] + fn next_back(&mut self) -> Option<&'a str> { + self.0.next_back() + } +} + +#[stable(feature = "fused", since = "1.26.0")] +#[allow(deprecated)] +impl FusedIterator for LinesAny<'_> {} + +/// An iterator over the non-whitespace substrings of a string, +/// separated by any amount of whitespace. +/// +/// This struct is created by the [`split_whitespace`] method on [`str`]. +/// See its documentation for more. 
+/// +/// [`split_whitespace`]: str::split_whitespace +#[stable(feature = "split_whitespace", since = "1.1.0")] +#[derive(Clone, Debug)] +pub struct SplitWhitespace<'a> { + pub(super) inner: Filter, IsNotEmpty>, +} + +/// An iterator over the non-ASCII-whitespace substrings of a string, +/// separated by any amount of ASCII whitespace. +/// +/// This struct is created by the [`split_ascii_whitespace`] method on [`str`]. +/// See its documentation for more. +/// +/// [`split_ascii_whitespace`]: str::split_ascii_whitespace +#[stable(feature = "split_ascii_whitespace", since = "1.34.0")] +#[derive(Clone, Debug)] +pub struct SplitAsciiWhitespace<'a> { + pub(super) inner: + Map, BytesIsNotEmpty>, UnsafeBytesToStr>, +} + +/// An iterator over the substrings of a string, +/// terminated by a substring matching to a predicate function +/// Unlike `Split`, it contains the matched part as a terminator +/// of the subslice. +/// +/// This struct is created by the [`split_inclusive`] method on [`str`]. +/// See its documentation for more. 
+/// +/// [`split_inclusive`]: str::split_inclusive +#[unstable(feature = "split_inclusive", issue = "72360")] +pub struct SplitInclusive<'a, P: Pattern<'a>>(pub(super) SplitInternal<'a, P>); + +#[stable(feature = "split_whitespace", since = "1.1.0")] +impl<'a> Iterator for SplitWhitespace<'a> { + type Item = &'a str; + + #[inline] + fn next(&mut self) -> Option<&'a str> { + self.inner.next() + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + self.inner.size_hint() + } + + #[inline] + fn last(mut self) -> Option<&'a str> { + self.next_back() + } +} + +#[stable(feature = "split_whitespace", since = "1.1.0")] +impl<'a> DoubleEndedIterator for SplitWhitespace<'a> { + #[inline] + fn next_back(&mut self) -> Option<&'a str> { + self.inner.next_back() + } +} + +#[stable(feature = "fused", since = "1.26.0")] +impl FusedIterator for SplitWhitespace<'_> {} + +#[stable(feature = "split_ascii_whitespace", since = "1.34.0")] +impl<'a> Iterator for SplitAsciiWhitespace<'a> { + type Item = &'a str; + + #[inline] + fn next(&mut self) -> Option<&'a str> { + self.inner.next() + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + self.inner.size_hint() + } + + #[inline] + fn last(mut self) -> Option<&'a str> { + self.next_back() + } +} + +#[stable(feature = "split_ascii_whitespace", since = "1.34.0")] +impl<'a> DoubleEndedIterator for SplitAsciiWhitespace<'a> { + #[inline] + fn next_back(&mut self) -> Option<&'a str> { + self.inner.next_back() + } +} + +#[stable(feature = "split_ascii_whitespace", since = "1.34.0")] +impl FusedIterator for SplitAsciiWhitespace<'_> {} + +#[unstable(feature = "split_inclusive", issue = "72360")] +impl<'a, P: Pattern<'a>> Iterator for SplitInclusive<'a, P> { + type Item = &'a str; + + #[inline] + fn next(&mut self) -> Option<&'a str> { + self.0.next_inclusive() + } +} + +#[unstable(feature = "split_inclusive", issue = "72360")] +impl<'a, P: Pattern<'a, Searcher: fmt::Debug>> fmt::Debug for SplitInclusive<'a, P> { + fn fmt(&self, f: 
&mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("SplitInclusive").field("0", &self.0).finish() + } +} + +// FIXME(#26925) Remove in favor of `#[derive(Clone)]` +#[unstable(feature = "split_inclusive", issue = "72360")] +impl<'a, P: Pattern<'a, Searcher: Clone>> Clone for SplitInclusive<'a, P> { + fn clone(&self) -> Self { + SplitInclusive(self.0.clone()) + } +} + +#[unstable(feature = "split_inclusive", issue = "72360")] +impl<'a, P: Pattern<'a, Searcher: ReverseSearcher<'a>>> DoubleEndedIterator + for SplitInclusive<'a, P> +{ + #[inline] + fn next_back(&mut self) -> Option<&'a str> { + self.0.next_back_inclusive() + } +} + +#[unstable(feature = "split_inclusive", issue = "72360")] +impl<'a, P: Pattern<'a>> FusedIterator for SplitInclusive<'a, P> {} + +/// An iterator of [`u16`] over the string encoded as UTF-16. +/// +/// This struct is created by the [`encode_utf16`] method on [`str`]. +/// See its documentation for more. +/// +/// [`encode_utf16`]: str::encode_utf16 +#[derive(Clone)] +#[stable(feature = "encode_utf16", since = "1.8.0")] +pub struct EncodeUtf16<'a> { + pub(super) chars: Chars<'a>, + pub(super) extra: u16, +} + +#[stable(feature = "collection_debug", since = "1.17.0")] +impl fmt::Debug for EncodeUtf16<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.pad("EncodeUtf16 { .. 
}") + } +} + +#[stable(feature = "encode_utf16", since = "1.8.0")] +impl<'a> Iterator for EncodeUtf16<'a> { + type Item = u16; + + #[inline] + fn next(&mut self) -> Option { + if self.extra != 0 { + let tmp = self.extra; + self.extra = 0; + return Some(tmp); + } + + let mut buf = [0; 2]; + self.chars.next().map(|ch| { + let n = ch.encode_utf16(&mut buf).len(); + if n == 2 { + self.extra = buf[1]; + } + buf[0] + }) + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + let (low, high) = self.chars.size_hint(); + // every char gets either one u16 or two u16, + // so this iterator is between 1 or 2 times as + // long as the underlying iterator. + (low, high.and_then(|n| n.checked_mul(2))) + } +} + +#[stable(feature = "fused", since = "1.26.0")] +impl FusedIterator for EncodeUtf16<'_> {} + +/// The return type of [`str::escape_debug`]. +#[stable(feature = "str_escape", since = "1.34.0")] +#[derive(Clone, Debug)] +pub struct EscapeDebug<'a> { + pub(super) inner: Chain< + Flatten>, + FlatMap, char::EscapeDebug, CharEscapeDebugContinue>, + >, +} + +/// The return type of [`str::escape_default`]. +#[stable(feature = "str_escape", since = "1.34.0")] +#[derive(Clone, Debug)] +pub struct EscapeDefault<'a> { + pub(super) inner: FlatMap, char::EscapeDefault, CharEscapeDefault>, +} + +/// The return type of [`str::escape_unicode`]. +#[stable(feature = "str_escape", since = "1.34.0")] +#[derive(Clone, Debug)] +pub struct EscapeUnicode<'a> { + pub(super) inner: FlatMap, char::EscapeUnicode, CharEscapeUnicode>, +} + +macro_rules! 
escape_types_impls { + ($( $Name: ident ),+) => {$( + #[stable(feature = "str_escape", since = "1.34.0")] + impl<'a> fmt::Display for $Name<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.clone().try_for_each(|c| f.write_char(c)) + } + } + + #[stable(feature = "str_escape", since = "1.34.0")] + impl<'a> Iterator for $Name<'a> { + type Item = char; + + #[inline] + fn next(&mut self) -> Option { self.inner.next() } + + #[inline] + fn size_hint(&self) -> (usize, Option) { self.inner.size_hint() } + + #[inline] + fn try_fold(&mut self, init: Acc, fold: Fold) -> R where + Self: Sized, Fold: FnMut(Acc, Self::Item) -> R, R: Try + { + self.inner.try_fold(init, fold) + } + + #[inline] + fn fold(self, init: Acc, fold: Fold) -> Acc + where Fold: FnMut(Acc, Self::Item) -> Acc, + { + self.inner.fold(init, fold) + } + } + + #[stable(feature = "str_escape", since = "1.34.0")] + impl<'a> FusedIterator for $Name<'a> {} + )+} +} + +escape_types_impls!(EscapeDebug, EscapeDefault, EscapeUnicode); diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs index 147d341c8b0..02b85ebfe49 100644 --- a/library/core/src/str/mod.rs +++ b/library/core/src/str/mod.rs @@ -9,20 +9,15 @@ #![stable(feature = "rust1", since = "1.0.0")] mod error; +mod iter; mod traits; use self::pattern::Pattern; use self::pattern::{DoubleEndedSearcher, ReverseSearcher, Searcher}; use crate::char; -use crate::fmt::{self, Write}; -use crate::iter::TrustedRandomAccess; -use crate::iter::{Chain, FlatMap, Flatten}; -use crate::iter::{Copied, Filter, FusedIterator, Map, TrustedLen}; use crate::mem; -use crate::ops::Try; -use crate::option; -use crate::slice::{self, SliceIndex, Split as SliceSplit}; +use crate::slice::{self, SliceIndex}; pub mod pattern; @@ -36,6 +31,41 @@ pub use error::{ParseBoolError, Utf8Error}; #[stable(feature = "rust1", since = "1.0.0")] pub use traits::FromStr; +#[stable(feature = "rust1", since = "1.0.0")] +pub use iter::{Bytes, CharIndices, Chars, Lines, 
SplitWhitespace}; + +#[stable(feature = "rust1", since = "1.0.0")] +#[allow(deprecated)] +pub use iter::LinesAny; + +#[stable(feature = "rust1", since = "1.0.0")] +pub use iter::{RSplit, RSplitTerminator, Split, SplitTerminator}; + +#[stable(feature = "rust1", since = "1.0.0")] +pub use iter::{RSplitN, SplitN}; + +#[stable(feature = "str_matches", since = "1.2.0")] +pub use iter::{Matches, RMatches}; + +#[stable(feature = "str_match_indices", since = "1.5.0")] +pub use iter::{MatchIndices, RMatchIndices}; + +#[stable(feature = "encode_utf16", since = "1.8.0")] +pub use iter::EncodeUtf16; + +#[stable(feature = "str_escape", since = "1.34.0")] +pub use iter::{EscapeDebug, EscapeDefault, EscapeUnicode}; + +#[stable(feature = "split_ascii_whitespace", since = "1.34.0")] +pub use iter::SplitAsciiWhitespace; + +#[unstable(feature = "split_inclusive", issue = "72360")] +use iter::SplitInclusive; + +use iter::MatchIndicesInternal; +use iter::SplitInternal; +use iter::{MatchesInternal, SplitNInternal}; + /* Section: Creating a string */ @@ -227,24 +257,6 @@ pub unsafe fn from_utf8_unchecked_mut(v: &mut [u8]) -> &mut str { unsafe { &mut *(v as *mut [u8] as *mut str) } } -/* -Section: Iterators -*/ - -/// An iterator over the [`char`]s of a string slice. -/// -/// -/// This struct is created by the [`chars`] method on [`str`]. -/// See its documentation for more. -/// -/// [`char`]: prim@char -/// [`chars`]: str::chars -#[derive(Clone)] -#[stable(feature = "rust1", since = "1.0.0")] -pub struct Chars<'a> { - iter: slice::Iter<'a, u8>, -} - /// Returns the initial codepoint accumulator for the first byte. /// The first byte is special, only want bottom 5 bits for width 2, 4 bits /// for width 3, and 3 bits for width 4. 
@@ -341,940 +353,6 @@ where Some(ch) } -#[stable(feature = "rust1", since = "1.0.0")] -impl<'a> Iterator for Chars<'a> { - type Item = char; - - #[inline] - fn next(&mut self) -> Option { - next_code_point(&mut self.iter).map(|ch| { - // SAFETY: `str` invariant says `ch` is a valid Unicode Scalar Value. - unsafe { char::from_u32_unchecked(ch) } - }) - } - - #[inline] - fn count(self) -> usize { - // length in `char` is equal to the number of non-continuation bytes - let bytes_len = self.iter.len(); - let mut cont_bytes = 0; - for &byte in self.iter { - cont_bytes += utf8_is_cont_byte(byte) as usize; - } - bytes_len - cont_bytes - } - - #[inline] - fn size_hint(&self) -> (usize, Option) { - let len = self.iter.len(); - // `(len + 3)` can't overflow, because we know that the `slice::Iter` - // belongs to a slice in memory which has a maximum length of - // `isize::MAX` (that's well below `usize::MAX`). - ((len + 3) / 4, Some(len)) - } - - #[inline] - fn last(mut self) -> Option { - // No need to go through the entire string. - self.next_back() - } -} - -#[stable(feature = "chars_debug_impl", since = "1.38.0")] -impl fmt::Debug for Chars<'_> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "Chars(")?; - f.debug_list().entries(self.clone()).finish()?; - write!(f, ")")?; - Ok(()) - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl<'a> DoubleEndedIterator for Chars<'a> { - #[inline] - fn next_back(&mut self) -> Option { - next_code_point_reverse(&mut self.iter).map(|ch| { - // SAFETY: `str` invariant says `ch` is a valid Unicode Scalar Value. - unsafe { char::from_u32_unchecked(ch) } - }) - } -} - -#[stable(feature = "fused", since = "1.26.0")] -impl FusedIterator for Chars<'_> {} - -impl<'a> Chars<'a> { - /// Views the underlying data as a subslice of the original data. - /// - /// This has the same lifetime as the original slice, and so the - /// iterator can continue to be used while this exists. 
- /// - /// # Examples - /// - /// ``` - /// let mut chars = "abc".chars(); - /// - /// assert_eq!(chars.as_str(), "abc"); - /// chars.next(); - /// assert_eq!(chars.as_str(), "bc"); - /// chars.next(); - /// chars.next(); - /// assert_eq!(chars.as_str(), ""); - /// ``` - #[stable(feature = "iter_to_slice", since = "1.4.0")] - #[inline] - pub fn as_str(&self) -> &'a str { - // SAFETY: `Chars` is only made from a str, which guarantees the iter is valid UTF-8. - unsafe { from_utf8_unchecked(self.iter.as_slice()) } - } -} - -/// An iterator over the [`char`]s of a string slice, and their positions. -/// -/// This struct is created by the [`char_indices`] method on [`str`]. -/// See its documentation for more. -/// -/// [`char`]: prim@char -/// [`char_indices`]: str::char_indices -#[derive(Clone, Debug)] -#[stable(feature = "rust1", since = "1.0.0")] -pub struct CharIndices<'a> { - front_offset: usize, - iter: Chars<'a>, -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl<'a> Iterator for CharIndices<'a> { - type Item = (usize, char); - - #[inline] - fn next(&mut self) -> Option<(usize, char)> { - let pre_len = self.iter.iter.len(); - match self.iter.next() { - None => None, - Some(ch) => { - let index = self.front_offset; - let len = self.iter.iter.len(); - self.front_offset += pre_len - len; - Some((index, ch)) - } - } - } - - #[inline] - fn count(self) -> usize { - self.iter.count() - } - - #[inline] - fn size_hint(&self) -> (usize, Option) { - self.iter.size_hint() - } - - #[inline] - fn last(mut self) -> Option<(usize, char)> { - // No need to go through the entire string. 
- self.next_back() - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl<'a> DoubleEndedIterator for CharIndices<'a> { - #[inline] - fn next_back(&mut self) -> Option<(usize, char)> { - self.iter.next_back().map(|ch| { - let index = self.front_offset + self.iter.iter.len(); - (index, ch) - }) - } -} - -#[stable(feature = "fused", since = "1.26.0")] -impl FusedIterator for CharIndices<'_> {} - -impl<'a> CharIndices<'a> { - /// Views the underlying data as a subslice of the original data. - /// - /// This has the same lifetime as the original slice, and so the - /// iterator can continue to be used while this exists. - #[stable(feature = "iter_to_slice", since = "1.4.0")] - #[inline] - pub fn as_str(&self) -> &'a str { - self.iter.as_str() - } -} - -/// An iterator over the bytes of a string slice. -/// -/// This struct is created by the [`bytes`] method on [`str`]. -/// See its documentation for more. -/// -/// [`bytes`]: str::bytes -#[stable(feature = "rust1", since = "1.0.0")] -#[derive(Clone, Debug)] -pub struct Bytes<'a>(Copied>); - -#[stable(feature = "rust1", since = "1.0.0")] -impl Iterator for Bytes<'_> { - type Item = u8; - - #[inline] - fn next(&mut self) -> Option { - self.0.next() - } - - #[inline] - fn size_hint(&self) -> (usize, Option) { - self.0.size_hint() - } - - #[inline] - fn count(self) -> usize { - self.0.count() - } - - #[inline] - fn last(self) -> Option { - self.0.last() - } - - #[inline] - fn nth(&mut self, n: usize) -> Option { - self.0.nth(n) - } - - #[inline] - fn all(&mut self, f: F) -> bool - where - F: FnMut(Self::Item) -> bool, - { - self.0.all(f) - } - - #[inline] - fn any(&mut self, f: F) -> bool - where - F: FnMut(Self::Item) -> bool, - { - self.0.any(f) - } - - #[inline] - fn find

(&mut self, predicate: P) -> Option - where - P: FnMut(&Self::Item) -> bool, - { - self.0.find(predicate) - } - - #[inline] - fn position

(&mut self, predicate: P) -> Option - where - P: FnMut(Self::Item) -> bool, - { - self.0.position(predicate) - } - - #[inline] - fn rposition

(&mut self, predicate: P) -> Option - where - P: FnMut(Self::Item) -> bool, - { - self.0.rposition(predicate) - } - - #[inline] - unsafe fn __iterator_get_unchecked(&mut self, idx: usize) -> u8 { - // SAFETY: the caller must uphold the safety contract - // for `Iterator::__iterator_get_unchecked`. - unsafe { self.0.__iterator_get_unchecked(idx) } - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl DoubleEndedIterator for Bytes<'_> { - #[inline] - fn next_back(&mut self) -> Option { - self.0.next_back() - } - - #[inline] - fn nth_back(&mut self, n: usize) -> Option { - self.0.nth_back(n) - } - - #[inline] - fn rfind

(&mut self, predicate: P) -> Option - where - P: FnMut(&Self::Item) -> bool, - { - self.0.rfind(predicate) - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl ExactSizeIterator for Bytes<'_> { - #[inline] - fn len(&self) -> usize { - self.0.len() - } - - #[inline] - fn is_empty(&self) -> bool { - self.0.is_empty() - } -} - -#[stable(feature = "fused", since = "1.26.0")] -impl FusedIterator for Bytes<'_> {} - -#[unstable(feature = "trusted_len", issue = "37572")] -unsafe impl TrustedLen for Bytes<'_> {} - -#[doc(hidden)] -#[unstable(feature = "trusted_random_access", issue = "none")] -unsafe impl TrustedRandomAccess for Bytes<'_> { - fn may_have_side_effect() -> bool { - false - } -} - -/// This macro generates a Clone impl for string pattern API -/// wrapper types of the form X<'a, P> -macro_rules! derive_pattern_clone { - (clone $t:ident with |$s:ident| $e:expr) => { - impl<'a, P> Clone for $t<'a, P> - where - P: Pattern<'a, Searcher: Clone>, - { - fn clone(&self) -> Self { - let $s = self; - $e - } - } - }; -} - -/// This macro generates two public iterator structs -/// wrapping a private internal one that makes use of the `Pattern` API. 
-/// -/// For all patterns `P: Pattern<'a>` the following items will be -/// generated (generics omitted): -/// -/// struct $forward_iterator($internal_iterator); -/// struct $reverse_iterator($internal_iterator); -/// -/// impl Iterator for $forward_iterator -/// { /* internal ends up calling Searcher::next_match() */ } -/// -/// impl DoubleEndedIterator for $forward_iterator -/// where P::Searcher: DoubleEndedSearcher -/// { /* internal ends up calling Searcher::next_match_back() */ } -/// -/// impl Iterator for $reverse_iterator -/// where P::Searcher: ReverseSearcher -/// { /* internal ends up calling Searcher::next_match_back() */ } -/// -/// impl DoubleEndedIterator for $reverse_iterator -/// where P::Searcher: DoubleEndedSearcher -/// { /* internal ends up calling Searcher::next_match() */ } -/// -/// The internal one is defined outside the macro, and has almost the same -/// semantic as a DoubleEndedIterator by delegating to `pattern::Searcher` and -/// `pattern::ReverseSearcher` for both forward and reverse iteration. -/// -/// "Almost", because a `Searcher` and a `ReverseSearcher` for a given -/// `Pattern` might not return the same elements, so actually implementing -/// `DoubleEndedIterator` for it would be incorrect. -/// (See the docs in `str::pattern` for more details) -/// -/// However, the internal struct still represents a single ended iterator from -/// either end, and depending on pattern is also a valid double ended iterator, -/// so the two wrapper structs implement `Iterator` -/// and `DoubleEndedIterator` depending on the concrete pattern type, leading -/// to the complex impls seen above. -macro_rules! 
generate_pattern_iterators { - { - // Forward iterator - forward: - $(#[$forward_iterator_attribute:meta])* - struct $forward_iterator:ident; - - // Reverse iterator - reverse: - $(#[$reverse_iterator_attribute:meta])* - struct $reverse_iterator:ident; - - // Stability of all generated items - stability: - $(#[$common_stability_attribute:meta])* - - // Internal almost-iterator that is being delegated to - internal: - $internal_iterator:ident yielding ($iterty:ty); - - // Kind of delegation - either single ended or double ended - delegate $($t:tt)* - } => { - $(#[$forward_iterator_attribute])* - $(#[$common_stability_attribute])* - pub struct $forward_iterator<'a, P: Pattern<'a>>($internal_iterator<'a, P>); - - $(#[$common_stability_attribute])* - impl<'a, P> fmt::Debug for $forward_iterator<'a, P> - where - P: Pattern<'a, Searcher: fmt::Debug>, - { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_tuple(stringify!($forward_iterator)) - .field(&self.0) - .finish() - } - } - - $(#[$common_stability_attribute])* - impl<'a, P: Pattern<'a>> Iterator for $forward_iterator<'a, P> { - type Item = $iterty; - - #[inline] - fn next(&mut self) -> Option<$iterty> { - self.0.next() - } - } - - $(#[$common_stability_attribute])* - impl<'a, P> Clone for $forward_iterator<'a, P> - where - P: Pattern<'a, Searcher: Clone>, - { - fn clone(&self) -> Self { - $forward_iterator(self.0.clone()) - } - } - - $(#[$reverse_iterator_attribute])* - $(#[$common_stability_attribute])* - pub struct $reverse_iterator<'a, P: Pattern<'a>>($internal_iterator<'a, P>); - - $(#[$common_stability_attribute])* - impl<'a, P> fmt::Debug for $reverse_iterator<'a, P> - where - P: Pattern<'a, Searcher: fmt::Debug>, - { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_tuple(stringify!($reverse_iterator)) - .field(&self.0) - .finish() - } - } - - $(#[$common_stability_attribute])* - impl<'a, P> Iterator for $reverse_iterator<'a, P> - where - P: Pattern<'a, Searcher: 
ReverseSearcher<'a>>, - { - type Item = $iterty; - - #[inline] - fn next(&mut self) -> Option<$iterty> { - self.0.next_back() - } - } - - $(#[$common_stability_attribute])* - impl<'a, P> Clone for $reverse_iterator<'a, P> - where - P: Pattern<'a, Searcher: Clone>, - { - fn clone(&self) -> Self { - $reverse_iterator(self.0.clone()) - } - } - - #[stable(feature = "fused", since = "1.26.0")] - impl<'a, P: Pattern<'a>> FusedIterator for $forward_iterator<'a, P> {} - - #[stable(feature = "fused", since = "1.26.0")] - impl<'a, P> FusedIterator for $reverse_iterator<'a, P> - where - P: Pattern<'a, Searcher: ReverseSearcher<'a>>, - {} - - generate_pattern_iterators!($($t)* with $(#[$common_stability_attribute])*, - $forward_iterator, - $reverse_iterator, $iterty); - }; - { - double ended; with $(#[$common_stability_attribute:meta])*, - $forward_iterator:ident, - $reverse_iterator:ident, $iterty:ty - } => { - $(#[$common_stability_attribute])* - impl<'a, P> DoubleEndedIterator for $forward_iterator<'a, P> - where - P: Pattern<'a, Searcher: DoubleEndedSearcher<'a>>, - { - #[inline] - fn next_back(&mut self) -> Option<$iterty> { - self.0.next_back() - } - } - - $(#[$common_stability_attribute])* - impl<'a, P> DoubleEndedIterator for $reverse_iterator<'a, P> - where - P: Pattern<'a, Searcher: DoubleEndedSearcher<'a>>, - { - #[inline] - fn next_back(&mut self) -> Option<$iterty> { - self.0.next() - } - } - }; - { - single ended; with $(#[$common_stability_attribute:meta])*, - $forward_iterator:ident, - $reverse_iterator:ident, $iterty:ty - } => {} -} - -derive_pattern_clone! 
{ - clone SplitInternal - with |s| SplitInternal { matcher: s.matcher.clone(), ..*s } -} - -struct SplitInternal<'a, P: Pattern<'a>> { - start: usize, - end: usize, - matcher: P::Searcher, - allow_trailing_empty: bool, - finished: bool, -} - -impl<'a, P> fmt::Debug for SplitInternal<'a, P> -where - P: Pattern<'a, Searcher: fmt::Debug>, -{ - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("SplitInternal") - .field("start", &self.start) - .field("end", &self.end) - .field("matcher", &self.matcher) - .field("allow_trailing_empty", &self.allow_trailing_empty) - .field("finished", &self.finished) - .finish() - } -} - -impl<'a, P: Pattern<'a>> SplitInternal<'a, P> { - #[inline] - fn get_end(&mut self) -> Option<&'a str> { - if !self.finished && (self.allow_trailing_empty || self.end - self.start > 0) { - self.finished = true; - // SAFETY: `self.start` and `self.end` always lie on unicode boundaries. - unsafe { - let string = self.matcher.haystack().get_unchecked(self.start..self.end); - Some(string) - } - } else { - None - } - } - - #[inline] - fn next(&mut self) -> Option<&'a str> { - if self.finished { - return None; - } - - let haystack = self.matcher.haystack(); - match self.matcher.next_match() { - // SAFETY: `Searcher` guarantees that `a` and `b` lie on unicode boundaries. - Some((a, b)) => unsafe { - let elt = haystack.get_unchecked(self.start..a); - self.start = b; - Some(elt) - }, - None => self.get_end(), - } - } - - #[inline] - fn next_inclusive(&mut self) -> Option<&'a str> { - if self.finished { - return None; - } - - let haystack = self.matcher.haystack(); - match self.matcher.next_match() { - // SAFETY: `Searcher` guarantees that `b` lies on unicode boundary, - // and self.start is either the start of the original string, - // or `b` was assigned to it, so it also lies on unicode boundary. 
- Some((_, b)) => unsafe { - let elt = haystack.get_unchecked(self.start..b); - self.start = b; - Some(elt) - }, - None => self.get_end(), - } - } - - #[inline] - fn next_back(&mut self) -> Option<&'a str> - where - P::Searcher: ReverseSearcher<'a>, - { - if self.finished { - return None; - } - - if !self.allow_trailing_empty { - self.allow_trailing_empty = true; - match self.next_back() { - Some(elt) if !elt.is_empty() => return Some(elt), - _ => { - if self.finished { - return None; - } - } - } - } - - let haystack = self.matcher.haystack(); - match self.matcher.next_match_back() { - // SAFETY: `Searcher` guarantees that `a` and `b` lie on unicode boundaries. - Some((a, b)) => unsafe { - let elt = haystack.get_unchecked(b..self.end); - self.end = a; - Some(elt) - }, - // SAFETY: `self.start` and `self.end` always lie on unicode boundaries. - None => unsafe { - self.finished = true; - Some(haystack.get_unchecked(self.start..self.end)) - }, - } - } - - #[inline] - fn next_back_inclusive(&mut self) -> Option<&'a str> - where - P::Searcher: ReverseSearcher<'a>, - { - if self.finished { - return None; - } - - if !self.allow_trailing_empty { - self.allow_trailing_empty = true; - match self.next_back_inclusive() { - Some(elt) if !elt.is_empty() => return Some(elt), - _ => { - if self.finished { - return None; - } - } - } - } - - let haystack = self.matcher.haystack(); - match self.matcher.next_match_back() { - // SAFETY: `Searcher` guarantees that `b` lies on unicode boundary, - // and self.end is either the end of the original string, - // or `b` was assigned to it, so it also lies on unicode boundary. - Some((_, b)) => unsafe { - let elt = haystack.get_unchecked(b..self.end); - self.end = b; - Some(elt) - }, - // SAFETY: self.start is either the start of the original string, - // or start of a substring that represents the part of the string that hasn't - // iterated yet. Either way, it is guaranteed to lie on unicode boundary. 
- // self.end is either the end of the original string, - // or `b` was assigned to it, so it also lies on unicode boundary. - None => unsafe { - self.finished = true; - Some(haystack.get_unchecked(self.start..self.end)) - }, - } - } -} - -generate_pattern_iterators! { - forward: - /// Created with the method [`split`]. - /// - /// [`split`]: str::split - struct Split; - reverse: - /// Created with the method [`rsplit`]. - /// - /// [`rsplit`]: str::rsplit - struct RSplit; - stability: - #[stable(feature = "rust1", since = "1.0.0")] - internal: - SplitInternal yielding (&'a str); - delegate double ended; -} - -generate_pattern_iterators! { - forward: - /// Created with the method [`split_terminator`]. - /// - /// [`split_terminator`]: str::split_terminator - struct SplitTerminator; - reverse: - /// Created with the method [`rsplit_terminator`]. - /// - /// [`rsplit_terminator`]: str::rsplit_terminator - struct RSplitTerminator; - stability: - #[stable(feature = "rust1", since = "1.0.0")] - internal: - SplitInternal yielding (&'a str); - delegate double ended; -} - -derive_pattern_clone! 
{ - clone SplitNInternal - with |s| SplitNInternal { iter: s.iter.clone(), ..*s } -} - -struct SplitNInternal<'a, P: Pattern<'a>> { - iter: SplitInternal<'a, P>, - /// The number of splits remaining - count: usize, -} - -impl<'a, P> fmt::Debug for SplitNInternal<'a, P> -where - P: Pattern<'a, Searcher: fmt::Debug>, -{ - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("SplitNInternal") - .field("iter", &self.iter) - .field("count", &self.count) - .finish() - } -} - -impl<'a, P: Pattern<'a>> SplitNInternal<'a, P> { - #[inline] - fn next(&mut self) -> Option<&'a str> { - match self.count { - 0 => None, - 1 => { - self.count = 0; - self.iter.get_end() - } - _ => { - self.count -= 1; - self.iter.next() - } - } - } - - #[inline] - fn next_back(&mut self) -> Option<&'a str> - where - P::Searcher: ReverseSearcher<'a>, - { - match self.count { - 0 => None, - 1 => { - self.count = 0; - self.iter.get_end() - } - _ => { - self.count -= 1; - self.iter.next_back() - } - } - } -} - -generate_pattern_iterators! { - forward: - /// Created with the method [`splitn`]. - /// - /// [`splitn`]: str::splitn - struct SplitN; - reverse: - /// Created with the method [`rsplitn`]. - /// - /// [`rsplitn`]: str::rsplitn - struct RSplitN; - stability: - #[stable(feature = "rust1", since = "1.0.0")] - internal: - SplitNInternal yielding (&'a str); - delegate single ended; -} - -derive_pattern_clone! 
{ - clone MatchIndicesInternal - with |s| MatchIndicesInternal(s.0.clone()) -} - -struct MatchIndicesInternal<'a, P: Pattern<'a>>(P::Searcher); - -impl<'a, P> fmt::Debug for MatchIndicesInternal<'a, P> -where - P: Pattern<'a, Searcher: fmt::Debug>, -{ - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_tuple("MatchIndicesInternal").field(&self.0).finish() - } -} - -impl<'a, P: Pattern<'a>> MatchIndicesInternal<'a, P> { - #[inline] - fn next(&mut self) -> Option<(usize, &'a str)> { - self.0 - .next_match() - // SAFETY: `Searcher` guarantees that `start` and `end` lie on unicode boundaries. - .map(|(start, end)| unsafe { (start, self.0.haystack().get_unchecked(start..end)) }) - } - - #[inline] - fn next_back(&mut self) -> Option<(usize, &'a str)> - where - P::Searcher: ReverseSearcher<'a>, - { - self.0 - .next_match_back() - // SAFETY: `Searcher` guarantees that `start` and `end` lie on unicode boundaries. - .map(|(start, end)| unsafe { (start, self.0.haystack().get_unchecked(start..end)) }) - } -} - -generate_pattern_iterators! { - forward: - /// Created with the method [`match_indices`]. - /// - /// [`match_indices`]: str::match_indices - struct MatchIndices; - reverse: - /// Created with the method [`rmatch_indices`]. - /// - /// [`rmatch_indices`]: str::rmatch_indices - struct RMatchIndices; - stability: - #[stable(feature = "str_match_indices", since = "1.5.0")] - internal: - MatchIndicesInternal yielding ((usize, &'a str)); - delegate double ended; -} - -derive_pattern_clone! 
{ - clone MatchesInternal - with |s| MatchesInternal(s.0.clone()) -} - -struct MatchesInternal<'a, P: Pattern<'a>>(P::Searcher); - -impl<'a, P> fmt::Debug for MatchesInternal<'a, P> -where - P: Pattern<'a, Searcher: fmt::Debug>, -{ - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_tuple("MatchesInternal").field(&self.0).finish() - } -} - -impl<'a, P: Pattern<'a>> MatchesInternal<'a, P> { - #[inline] - fn next(&mut self) -> Option<&'a str> { - // SAFETY: `Searcher` guarantees that `start` and `end` lie on unicode boundaries. - self.0.next_match().map(|(a, b)| unsafe { - // Indices are known to be on utf8 boundaries - self.0.haystack().get_unchecked(a..b) - }) - } - - #[inline] - fn next_back(&mut self) -> Option<&'a str> - where - P::Searcher: ReverseSearcher<'a>, - { - // SAFETY: `Searcher` guarantees that `start` and `end` lie on unicode boundaries. - self.0.next_match_back().map(|(a, b)| unsafe { - // Indices are known to be on utf8 boundaries - self.0.haystack().get_unchecked(a..b) - }) - } -} - -generate_pattern_iterators! { - forward: - /// Created with the method [`matches`]. - /// - /// [`matches`]: str::matches - struct Matches; - reverse: - /// Created with the method [`rmatches`]. - /// - /// [`rmatches`]: str::rmatches - struct RMatches; - stability: - #[stable(feature = "str_matches", since = "1.2.0")] - internal: - MatchesInternal yielding (&'a str); - delegate double ended; -} - -/// An iterator over the lines of a string, as string slices. -/// -/// This struct is created with the [`lines`] method on [`str`]. -/// See its documentation for more. 
-/// -/// [`lines`]: str::lines -#[stable(feature = "rust1", since = "1.0.0")] -#[derive(Clone, Debug)] -pub struct Lines<'a>(Map, LinesAnyMap>); - -#[stable(feature = "rust1", since = "1.0.0")] -impl<'a> Iterator for Lines<'a> { - type Item = &'a str; - - #[inline] - fn next(&mut self) -> Option<&'a str> { - self.0.next() - } - - #[inline] - fn size_hint(&self) -> (usize, Option) { - self.0.size_hint() - } - - #[inline] - fn last(mut self) -> Option<&'a str> { - self.next_back() - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl<'a> DoubleEndedIterator for Lines<'a> { - #[inline] - fn next_back(&mut self) -> Option<&'a str> { - self.0.next_back() - } -} - -#[stable(feature = "fused", since = "1.26.0")] -impl FusedIterator for Lines<'_> {} - -/// Created with the method [`lines_any`]. -/// -/// [`lines_any`]: str::lines_any -#[stable(feature = "rust1", since = "1.0.0")] -#[rustc_deprecated(since = "1.4.0", reason = "use lines()/Lines instead now")] -#[derive(Clone, Debug)] -#[allow(deprecated)] -pub struct LinesAny<'a>(Lines<'a>); - impl_fn_for_zst! { /// A nameable, cloneable fn type #[derive(Clone)] @@ -1285,35 +363,6 @@ impl_fn_for_zst! { }; } -#[stable(feature = "rust1", since = "1.0.0")] -#[allow(deprecated)] -impl<'a> Iterator for LinesAny<'a> { - type Item = &'a str; - - #[inline] - fn next(&mut self) -> Option<&'a str> { - self.0.next() - } - - #[inline] - fn size_hint(&self) -> (usize, Option) { - self.0.size_hint() - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -#[allow(deprecated)] -impl<'a> DoubleEndedIterator for LinesAny<'a> { - #[inline] - fn next_back(&mut self) -> Option<&'a str> { - self.0.next_back() - } -} - -#[stable(feature = "fused", since = "1.26.0")] -#[allow(deprecated)] -impl FusedIterator for LinesAny<'_> {} - /* Section: UTF-8 validation */ @@ -3869,44 +2918,6 @@ impl Default for &mut str { } } -/// An iterator over the non-whitespace substrings of a string, -/// separated by any amount of whitespace. 
-/// -/// This struct is created by the [`split_whitespace`] method on [`str`]. -/// See its documentation for more. -/// -/// [`split_whitespace`]: str::split_whitespace -#[stable(feature = "split_whitespace", since = "1.1.0")] -#[derive(Clone, Debug)] -pub struct SplitWhitespace<'a> { - inner: Filter, IsNotEmpty>, -} - -/// An iterator over the non-ASCII-whitespace substrings of a string, -/// separated by any amount of ASCII whitespace. -/// -/// This struct is created by the [`split_ascii_whitespace`] method on [`str`]. -/// See its documentation for more. -/// -/// [`split_ascii_whitespace`]: str::split_ascii_whitespace -#[stable(feature = "split_ascii_whitespace", since = "1.34.0")] -#[derive(Clone, Debug)] -pub struct SplitAsciiWhitespace<'a> { - inner: Map, BytesIsNotEmpty>, UnsafeBytesToStr>, -} - -/// An iterator over the substrings of a string, -/// terminated by a substring matching to a predicate function -/// Unlike `Split`, it contains the matched part as a terminator -/// of the subslice. -/// -/// This struct is created by the [`split_inclusive`] method on [`str`]. -/// See its documentation for more. -/// -/// [`split_inclusive`]: str::split_inclusive -#[unstable(feature = "split_inclusive", issue = "72360")] -pub struct SplitInclusive<'a, P: Pattern<'a>>(SplitInternal<'a, P>); - impl_fn_for_zst! { #[derive(Clone)] struct IsWhitespace impl Fn = |c: char| -> bool { @@ -3934,223 +2945,3 @@ impl_fn_for_zst! 
{ unsafe { from_utf8_unchecked(bytes) } }; } - -#[stable(feature = "split_whitespace", since = "1.1.0")] -impl<'a> Iterator for SplitWhitespace<'a> { - type Item = &'a str; - - #[inline] - fn next(&mut self) -> Option<&'a str> { - self.inner.next() - } - - #[inline] - fn size_hint(&self) -> (usize, Option) { - self.inner.size_hint() - } - - #[inline] - fn last(mut self) -> Option<&'a str> { - self.next_back() - } -} - -#[stable(feature = "split_whitespace", since = "1.1.0")] -impl<'a> DoubleEndedIterator for SplitWhitespace<'a> { - #[inline] - fn next_back(&mut self) -> Option<&'a str> { - self.inner.next_back() - } -} - -#[stable(feature = "fused", since = "1.26.0")] -impl FusedIterator for SplitWhitespace<'_> {} - -#[stable(feature = "split_ascii_whitespace", since = "1.34.0")] -impl<'a> Iterator for SplitAsciiWhitespace<'a> { - type Item = &'a str; - - #[inline] - fn next(&mut self) -> Option<&'a str> { - self.inner.next() - } - - #[inline] - fn size_hint(&self) -> (usize, Option) { - self.inner.size_hint() - } - - #[inline] - fn last(mut self) -> Option<&'a str> { - self.next_back() - } -} - -#[stable(feature = "split_ascii_whitespace", since = "1.34.0")] -impl<'a> DoubleEndedIterator for SplitAsciiWhitespace<'a> { - #[inline] - fn next_back(&mut self) -> Option<&'a str> { - self.inner.next_back() - } -} - -#[stable(feature = "split_ascii_whitespace", since = "1.34.0")] -impl FusedIterator for SplitAsciiWhitespace<'_> {} - -#[unstable(feature = "split_inclusive", issue = "72360")] -impl<'a, P: Pattern<'a>> Iterator for SplitInclusive<'a, P> { - type Item = &'a str; - - #[inline] - fn next(&mut self) -> Option<&'a str> { - self.0.next_inclusive() - } -} - -#[unstable(feature = "split_inclusive", issue = "72360")] -impl<'a, P: Pattern<'a, Searcher: fmt::Debug>> fmt::Debug for SplitInclusive<'a, P> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("SplitInclusive").field("0", &self.0).finish() - } -} - -// FIXME(#26925) Remove in 
favor of `#[derive(Clone)]` -#[unstable(feature = "split_inclusive", issue = "72360")] -impl<'a, P: Pattern<'a, Searcher: Clone>> Clone for SplitInclusive<'a, P> { - fn clone(&self) -> Self { - SplitInclusive(self.0.clone()) - } -} - -#[unstable(feature = "split_inclusive", issue = "72360")] -impl<'a, P: Pattern<'a, Searcher: ReverseSearcher<'a>>> DoubleEndedIterator - for SplitInclusive<'a, P> -{ - #[inline] - fn next_back(&mut self) -> Option<&'a str> { - self.0.next_back_inclusive() - } -} - -#[unstable(feature = "split_inclusive", issue = "72360")] -impl<'a, P: Pattern<'a>> FusedIterator for SplitInclusive<'a, P> {} - -/// An iterator of [`u16`] over the string encoded as UTF-16. -/// -/// This struct is created by the [`encode_utf16`] method on [`str`]. -/// See its documentation for more. -/// -/// [`encode_utf16`]: str::encode_utf16 -#[derive(Clone)] -#[stable(feature = "encode_utf16", since = "1.8.0")] -pub struct EncodeUtf16<'a> { - chars: Chars<'a>, - extra: u16, -} - -#[stable(feature = "collection_debug", since = "1.17.0")] -impl fmt::Debug for EncodeUtf16<'_> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.pad("EncodeUtf16 { .. }") - } -} - -#[stable(feature = "encode_utf16", since = "1.8.0")] -impl<'a> Iterator for EncodeUtf16<'a> { - type Item = u16; - - #[inline] - fn next(&mut self) -> Option { - if self.extra != 0 { - let tmp = self.extra; - self.extra = 0; - return Some(tmp); - } - - let mut buf = [0; 2]; - self.chars.next().map(|ch| { - let n = ch.encode_utf16(&mut buf).len(); - if n == 2 { - self.extra = buf[1]; - } - buf[0] - }) - } - - #[inline] - fn size_hint(&self) -> (usize, Option) { - let (low, high) = self.chars.size_hint(); - // every char gets either one u16 or two u16, - // so this iterator is between 1 or 2 times as - // long as the underlying iterator. 
- (low, high.and_then(|n| n.checked_mul(2))) - } -} - -#[stable(feature = "fused", since = "1.26.0")] -impl FusedIterator for EncodeUtf16<'_> {} - -/// The return type of [`str::escape_debug`]. -#[stable(feature = "str_escape", since = "1.34.0")] -#[derive(Clone, Debug)] -pub struct EscapeDebug<'a> { - inner: Chain< - Flatten>, - FlatMap, char::EscapeDebug, CharEscapeDebugContinue>, - >, -} - -/// The return type of [`str::escape_default`]. -#[stable(feature = "str_escape", since = "1.34.0")] -#[derive(Clone, Debug)] -pub struct EscapeDefault<'a> { - inner: FlatMap, char::EscapeDefault, CharEscapeDefault>, -} - -/// The return type of [`str::escape_unicode`]. -#[stable(feature = "str_escape", since = "1.34.0")] -#[derive(Clone, Debug)] -pub struct EscapeUnicode<'a> { - inner: FlatMap, char::EscapeUnicode, CharEscapeUnicode>, -} - -macro_rules! escape_types_impls { - ($( $Name: ident ),+) => {$( - #[stable(feature = "str_escape", since = "1.34.0")] - impl<'a> fmt::Display for $Name<'a> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - self.clone().try_for_each(|c| f.write_char(c)) - } - } - - #[stable(feature = "str_escape", since = "1.34.0")] - impl<'a> Iterator for $Name<'a> { - type Item = char; - - #[inline] - fn next(&mut self) -> Option { self.inner.next() } - - #[inline] - fn size_hint(&self) -> (usize, Option) { self.inner.size_hint() } - - #[inline] - fn try_fold(&mut self, init: Acc, fold: Fold) -> R where - Self: Sized, Fold: FnMut(Acc, Self::Item) -> R, R: Try - { - self.inner.try_fold(init, fold) - } - - #[inline] - fn fold(self, init: Acc, fold: Fold) -> Acc - where Fold: FnMut(Acc, Self::Item) -> Acc, - { - self.inner.fold(init, fold) - } - } - - #[stable(feature = "str_escape", since = "1.34.0")] - impl<'a> FusedIterator for $Name<'a> {} - )+} -} - -escape_types_impls!(EscapeDebug, EscapeDefault, EscapeUnicode); From 90c813a0f0b5042a2bbf2d9ebf27f21acdbc9f77 Mon Sep 17 00:00:00 2001 From: Lzu Tao Date: Fri, 4 Sep 2020 09:06:21 +0000 
Subject: [PATCH 4/7] Move utf-8 validating helpers to new mod --- library/core/src/str/iter.rs | 2 +- library/core/src/str/lossy.rs | 10 +- library/core/src/str/mod.rs | 280 +--------------------------- library/core/src/str/validations.rs | 275 +++++++++++++++++++++++++++ 4 files changed, 288 insertions(+), 279 deletions(-) create mode 100644 library/core/src/str/validations.rs diff --git a/library/core/src/str/iter.rs b/library/core/src/str/iter.rs index 993df96a2d1..27a67e2b22f 100644 --- a/library/core/src/str/iter.rs +++ b/library/core/src/str/iter.rs @@ -12,8 +12,8 @@ use crate::slice::{self, Split as SliceSplit}; use super::from_utf8_unchecked; use super::pattern::Pattern; use super::pattern::{DoubleEndedSearcher, ReverseSearcher, Searcher}; +use super::validations::{next_code_point, next_code_point_reverse, utf8_is_cont_byte}; use super::LinesAnyMap; -use super::{next_code_point, next_code_point_reverse, utf8_is_cont_byte}; use super::{BytesIsNotEmpty, UnsafeBytesToStr}; use super::{CharEscapeDebugContinue, CharEscapeDefault, CharEscapeUnicode}; use super::{IsAsciiWhitespace, IsNotEmpty, IsWhitespace}; diff --git a/library/core/src/str/lossy.rs b/library/core/src/str/lossy.rs index 88b2bc551b7..720a35bbc8f 100644 --- a/library/core/src/str/lossy.rs +++ b/library/core/src/str/lossy.rs @@ -1,7 +1,9 @@ use crate::char; use crate::fmt::{self, Write}; use crate::mem; -use crate::str as core_str; + +use super::from_utf8_unchecked; +use super::validations::utf8_char_width; /// Lossy UTF-8 string. #[unstable(feature = "str_internals", issue = "none")] @@ -66,14 +68,14 @@ impl<'a> Iterator for Utf8LossyChunksIter<'a> { if byte < 128 { } else { - let w = core_str::utf8_char_width(byte); + let w = utf8_char_width(byte); macro_rules! error { () => {{ // SAFETY: We have checked up to `i` that source is valid UTF-8. 
unsafe { let r = Utf8LossyChunk { - valid: core_str::from_utf8_unchecked(&self.source[0..i_]), + valid: from_utf8_unchecked(&self.source[0..i_]), broken: &self.source[i_..i], }; self.source = &self.source[i..]; @@ -133,7 +135,7 @@ impl<'a> Iterator for Utf8LossyChunksIter<'a> { let r = Utf8LossyChunk { // SAFETY: We have checked that the entire source is valid UTF-8. - valid: unsafe { core_str::from_utf8_unchecked(self.source) }, + valid: unsafe { from_utf8_unchecked(self.source) }, broken: &[], }; self.source = &[]; diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs index 02b85ebfe49..ab9bec2fd2d 100644 --- a/library/core/src/str/mod.rs +++ b/library/core/src/str/mod.rs @@ -11,6 +11,7 @@ mod error; mod iter; mod traits; +mod validations; use self::pattern::Pattern; use self::pattern::{DoubleEndedSearcher, ReverseSearcher, Searcher}; @@ -62,10 +63,15 @@ pub use iter::SplitAsciiWhitespace; #[unstable(feature = "split_inclusive", issue = "72360")] use iter::SplitInclusive; +#[unstable(feature = "str_internals", issue = "none")] +pub use validations::next_code_point; + use iter::MatchIndicesInternal; use iter::SplitInternal; use iter::{MatchesInternal, SplitNInternal}; +use validations::{run_utf8_validation, truncate_to_char_boundary}; + /* Section: Creating a string */ @@ -257,102 +263,6 @@ pub unsafe fn from_utf8_unchecked_mut(v: &mut [u8]) -> &mut str { unsafe { &mut *(v as *mut [u8] as *mut str) } } -/// Returns the initial codepoint accumulator for the first byte. -/// The first byte is special, only want bottom 5 bits for width 2, 4 bits -/// for width 3, and 3 bits for width 4. -#[inline] -fn utf8_first_byte(byte: u8, width: u32) -> u32 { - (byte & (0x7F >> width)) as u32 -} - -/// Returns the value of `ch` updated with continuation byte `byte`. 
-#[inline] -fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 { - (ch << 6) | (byte & CONT_MASK) as u32 -} - -/// Checks whether the byte is a UTF-8 continuation byte (i.e., starts with the -/// bits `10`). -#[inline] -fn utf8_is_cont_byte(byte: u8) -> bool { - (byte & !CONT_MASK) == TAG_CONT_U8 -} - -#[inline] -fn unwrap_or_0(opt: Option<&u8>) -> u8 { - match opt { - Some(&byte) => byte, - None => 0, - } -} - -/// Reads the next code point out of a byte iterator (assuming a -/// UTF-8-like encoding). -#[unstable(feature = "str_internals", issue = "none")] -#[inline] -pub fn next_code_point<'a, I: Iterator>(bytes: &mut I) -> Option { - // Decode UTF-8 - let x = *bytes.next()?; - if x < 128 { - return Some(x as u32); - } - - // Multibyte case follows - // Decode from a byte combination out of: [[[x y] z] w] - // NOTE: Performance is sensitive to the exact formulation here - let init = utf8_first_byte(x, 2); - let y = unwrap_or_0(bytes.next()); - let mut ch = utf8_acc_cont_byte(init, y); - if x >= 0xE0 { - // [[x y z] w] case - // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid - let z = unwrap_or_0(bytes.next()); - let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z); - ch = init << 12 | y_z; - if x >= 0xF0 { - // [x y z w] case - // use only the lower 3 bits of `init` - let w = unwrap_or_0(bytes.next()); - ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w); - } - } - - Some(ch) -} - -/// Reads the last code point out of a byte iterator (assuming a -/// UTF-8-like encoding). -#[inline] -fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option -where - I: DoubleEndedIterator, -{ - // Decode UTF-8 - let w = match *bytes.next_back()? 
{ - next_byte if next_byte < 128 => return Some(next_byte as u32), - back_byte => back_byte, - }; - - // Multibyte case follows - // Decode from a byte combination out of: [x [y [z w]]] - let mut ch; - let z = unwrap_or_0(bytes.next_back()); - ch = utf8_first_byte(z, 2); - if utf8_is_cont_byte(z) { - let y = unwrap_or_0(bytes.next_back()); - ch = utf8_first_byte(y, 3); - if utf8_is_cont_byte(y) { - let x = unwrap_or_0(bytes.next_back()); - ch = utf8_first_byte(x, 4); - ch = utf8_acc_cont_byte(ch, y); - } - ch = utf8_acc_cont_byte(ch, z); - } - ch = utf8_acc_cont_byte(ch, w); - - Some(ch) -} - impl_fn_for_zst! { /// A nameable, cloneable fn type #[derive(Clone)] @@ -363,184 +273,6 @@ impl_fn_for_zst! { }; } -/* -Section: UTF-8 validation -*/ - -// use truncation to fit u64 into usize -const NONASCII_MASK: usize = 0x80808080_80808080u64 as usize; - -/// Returns `true` if any byte in the word `x` is nonascii (>= 128). -#[inline] -fn contains_nonascii(x: usize) -> bool { - (x & NONASCII_MASK) != 0 -} - -/// Walks through `v` checking that it's a valid UTF-8 sequence, -/// returning `Ok(())` in that case, or, if it is invalid, `Err(err)`. -#[inline(always)] -fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> { - let mut index = 0; - let len = v.len(); - - let usize_bytes = mem::size_of::(); - let ascii_block_size = 2 * usize_bytes; - let blocks_end = if len >= ascii_block_size { len - ascii_block_size + 1 } else { 0 }; - let align = v.as_ptr().align_offset(usize_bytes); - - while index < len { - let old_offset = index; - macro_rules! err { - ($error_len: expr) => { - return Err(Utf8Error { valid_up_to: old_offset, error_len: $error_len }); - }; - } - - macro_rules! next { - () => {{ - index += 1; - // we needed data, but there was none: error! 
- if index >= len { - err!(None) - } - v[index] - }}; - } - - let first = v[index]; - if first >= 128 { - let w = UTF8_CHAR_WIDTH[first as usize]; - // 2-byte encoding is for codepoints \u{0080} to \u{07ff} - // first C2 80 last DF BF - // 3-byte encoding is for codepoints \u{0800} to \u{ffff} - // first E0 A0 80 last EF BF BF - // excluding surrogates codepoints \u{d800} to \u{dfff} - // ED A0 80 to ED BF BF - // 4-byte encoding is for codepoints \u{1000}0 to \u{10ff}ff - // first F0 90 80 80 last F4 8F BF BF - // - // Use the UTF-8 syntax from the RFC - // - // https://tools.ietf.org/html/rfc3629 - // UTF8-1 = %x00-7F - // UTF8-2 = %xC2-DF UTF8-tail - // UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) / - // %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail ) - // UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) / - // %xF4 %x80-8F 2( UTF8-tail ) - match w { - 2 => { - if next!() & !CONT_MASK != TAG_CONT_U8 { - err!(Some(1)) - } - } - 3 => { - match (first, next!()) { - (0xE0, 0xA0..=0xBF) - | (0xE1..=0xEC, 0x80..=0xBF) - | (0xED, 0x80..=0x9F) - | (0xEE..=0xEF, 0x80..=0xBF) => {} - _ => err!(Some(1)), - } - if next!() & !CONT_MASK != TAG_CONT_U8 { - err!(Some(2)) - } - } - 4 => { - match (first, next!()) { - (0xF0, 0x90..=0xBF) | (0xF1..=0xF3, 0x80..=0xBF) | (0xF4, 0x80..=0x8F) => {} - _ => err!(Some(1)), - } - if next!() & !CONT_MASK != TAG_CONT_U8 { - err!(Some(2)) - } - if next!() & !CONT_MASK != TAG_CONT_U8 { - err!(Some(3)) - } - } - _ => err!(Some(1)), - } - index += 1; - } else { - // Ascii case, try to skip forward quickly. - // When the pointer is aligned, read 2 words of data per iteration - // until we find a word containing a non-ascii byte. 
- if align != usize::MAX && align.wrapping_sub(index) % usize_bytes == 0 { - let ptr = v.as_ptr(); - while index < blocks_end { - // SAFETY: since `align - index` and `ascii_block_size` are - // multiples of `usize_bytes`, `block = ptr.add(index)` is - // always aligned with a `usize` so it's safe to dereference - // both `block` and `block.offset(1)`. - unsafe { - let block = ptr.add(index) as *const usize; - // break if there is a nonascii byte - let zu = contains_nonascii(*block); - let zv = contains_nonascii(*block.offset(1)); - if zu | zv { - break; - } - } - index += ascii_block_size; - } - // step from the point where the wordwise loop stopped - while index < len && v[index] < 128 { - index += 1; - } - } else { - index += 1; - } - } - } - - Ok(()) -} - -// https://tools.ietf.org/html/rfc3629 -static UTF8_CHAR_WIDTH: [u8; 256] = [ - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, // 0x1F - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, // 0x3F - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, // 0x5F - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, // 0x7F - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, // 0x9F - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, // 0xBF - 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, // 0xDF - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xEF - 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xFF -]; - -/// Given a first byte, determines how many bytes are in this UTF-8 character. 
-#[unstable(feature = "str_internals", issue = "none")] -#[inline] -pub fn utf8_char_width(b: u8) -> usize { - UTF8_CHAR_WIDTH[b as usize] as usize -} - -/// Mask of the value bits of a continuation byte. -const CONT_MASK: u8 = 0b0011_1111; -/// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte. -const TAG_CONT_U8: u8 = 0b1000_0000; - -// truncate `&str` to length at most equal to `max` -// return `true` if it were truncated, and the new str. -fn truncate_to_char_boundary(s: &str, mut max: usize) -> (bool, &str) { - if max >= s.len() { - (false, s) - } else { - while !s.is_char_boundary(max) { - max -= 1; - } - (true, &s[..max]) - } -} - #[inline(never)] #[cold] #[track_caller] diff --git a/library/core/src/str/validations.rs b/library/core/src/str/validations.rs new file mode 100644 index 00000000000..10cf1e172e6 --- /dev/null +++ b/library/core/src/str/validations.rs @@ -0,0 +1,275 @@ +//! Operations related to UTF-8 validation. + +use crate::mem; + +use super::Utf8Error; + +/// Returns the initial codepoint accumulator for the first byte. +/// The first byte is special, only want bottom 5 bits for width 2, 4 bits +/// for width 3, and 3 bits for width 4. +#[inline] +fn utf8_first_byte(byte: u8, width: u32) -> u32 { + (byte & (0x7F >> width)) as u32 +} + +/// Returns the value of `ch` updated with continuation byte `byte`. +#[inline] +fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 { + (ch << 6) | (byte & CONT_MASK) as u32 +} + +/// Checks whether the byte is a UTF-8 continuation byte (i.e., starts with the +/// bits `10`). +#[inline] +pub(super) fn utf8_is_cont_byte(byte: u8) -> bool { + (byte & !CONT_MASK) == TAG_CONT_U8 +} + +#[inline] +fn unwrap_or_0(opt: Option<&u8>) -> u8 { + match opt { + Some(&byte) => byte, + None => 0, + } +} + +/// Reads the next code point out of a byte iterator (assuming a +/// UTF-8-like encoding). 
+#[unstable(feature = "str_internals", issue = "none")] +#[inline] +pub fn next_code_point<'a, I: Iterator>(bytes: &mut I) -> Option { + // Decode UTF-8 + let x = *bytes.next()?; + if x < 128 { + return Some(x as u32); + } + + // Multibyte case follows + // Decode from a byte combination out of: [[[x y] z] w] + // NOTE: Performance is sensitive to the exact formulation here + let init = utf8_first_byte(x, 2); + let y = unwrap_or_0(bytes.next()); + let mut ch = utf8_acc_cont_byte(init, y); + if x >= 0xE0 { + // [[x y z] w] case + // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid + let z = unwrap_or_0(bytes.next()); + let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z); + ch = init << 12 | y_z; + if x >= 0xF0 { + // [x y z w] case + // use only the lower 3 bits of `init` + let w = unwrap_or_0(bytes.next()); + ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w); + } + } + + Some(ch) +} + +/// Reads the last code point out of a byte iterator (assuming a +/// UTF-8-like encoding). +#[inline] +pub(super) fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option +where + I: DoubleEndedIterator, +{ + // Decode UTF-8 + let w = match *bytes.next_back()? { + next_byte if next_byte < 128 => return Some(next_byte as u32), + back_byte => back_byte, + }; + + // Multibyte case follows + // Decode from a byte combination out of: [x [y [z w]]] + let mut ch; + let z = unwrap_or_0(bytes.next_back()); + ch = utf8_first_byte(z, 2); + if utf8_is_cont_byte(z) { + let y = unwrap_or_0(bytes.next_back()); + ch = utf8_first_byte(y, 3); + if utf8_is_cont_byte(y) { + let x = unwrap_or_0(bytes.next_back()); + ch = utf8_first_byte(x, 4); + ch = utf8_acc_cont_byte(ch, y); + } + ch = utf8_acc_cont_byte(ch, z); + } + ch = utf8_acc_cont_byte(ch, w); + + Some(ch) +} + +// use truncation to fit u64 into usize +const NONASCII_MASK: usize = 0x80808080_80808080u64 as usize; + +/// Returns `true` if any byte in the word `x` is nonascii (>= 128). 
+#[inline] +fn contains_nonascii(x: usize) -> bool { + (x & NONASCII_MASK) != 0 +} + +/// Walks through `v` checking that it's a valid UTF-8 sequence, +/// returning `Ok(())` in that case, or, if it is invalid, `Err(err)`. +#[inline(always)] +pub(super) fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> { + let mut index = 0; + let len = v.len(); + + let usize_bytes = mem::size_of::(); + let ascii_block_size = 2 * usize_bytes; + let blocks_end = if len >= ascii_block_size { len - ascii_block_size + 1 } else { 0 }; + let align = v.as_ptr().align_offset(usize_bytes); + + while index < len { + let old_offset = index; + macro_rules! err { + ($error_len: expr) => { + return Err(Utf8Error { valid_up_to: old_offset, error_len: $error_len }); + }; + } + + macro_rules! next { + () => {{ + index += 1; + // we needed data, but there was none: error! + if index >= len { + err!(None) + } + v[index] + }}; + } + + let first = v[index]; + if first >= 128 { + let w = UTF8_CHAR_WIDTH[first as usize]; + // 2-byte encoding is for codepoints \u{0080} to \u{07ff} + // first C2 80 last DF BF + // 3-byte encoding is for codepoints \u{0800} to \u{ffff} + // first E0 A0 80 last EF BF BF + // excluding surrogates codepoints \u{d800} to \u{dfff} + // ED A0 80 to ED BF BF + // 4-byte encoding is for codepoints \u{1000}0 to \u{10ff}ff + // first F0 90 80 80 last F4 8F BF BF + // + // Use the UTF-8 syntax from the RFC + // + // https://tools.ietf.org/html/rfc3629 + // UTF8-1 = %x00-7F + // UTF8-2 = %xC2-DF UTF8-tail + // UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) / + // %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail ) + // UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) / + // %xF4 %x80-8F 2( UTF8-tail ) + match w { + 2 => { + if next!() & !CONT_MASK != TAG_CONT_U8 { + err!(Some(1)) + } + } + 3 => { + match (first, next!()) { + (0xE0, 0xA0..=0xBF) + | (0xE1..=0xEC, 0x80..=0xBF) + | (0xED, 0x80..=0x9F) + | (0xEE..=0xEF, 0x80..=0xBF) => {} + _ => err!(Some(1)), + 
} + if next!() & !CONT_MASK != TAG_CONT_U8 { + err!(Some(2)) + } + } + 4 => { + match (first, next!()) { + (0xF0, 0x90..=0xBF) | (0xF1..=0xF3, 0x80..=0xBF) | (0xF4, 0x80..=0x8F) => {} + _ => err!(Some(1)), + } + if next!() & !CONT_MASK != TAG_CONT_U8 { + err!(Some(2)) + } + if next!() & !CONT_MASK != TAG_CONT_U8 { + err!(Some(3)) + } + } + _ => err!(Some(1)), + } + index += 1; + } else { + // Ascii case, try to skip forward quickly. + // When the pointer is aligned, read 2 words of data per iteration + // until we find a word containing a non-ascii byte. + if align != usize::MAX && align.wrapping_sub(index) % usize_bytes == 0 { + let ptr = v.as_ptr(); + while index < blocks_end { + // SAFETY: since `align - index` and `ascii_block_size` are + // multiples of `usize_bytes`, `block = ptr.add(index)` is + // always aligned with a `usize` so it's safe to dereference + // both `block` and `block.offset(1)`. + unsafe { + let block = ptr.add(index) as *const usize; + // break if there is a nonascii byte + let zu = contains_nonascii(*block); + let zv = contains_nonascii(*block.offset(1)); + if zu | zv { + break; + } + } + index += ascii_block_size; + } + // step from the point where the wordwise loop stopped + while index < len && v[index] < 128 { + index += 1; + } + } else { + index += 1; + } + } + } + + Ok(()) +} + +// https://tools.ietf.org/html/rfc3629 +static UTF8_CHAR_WIDTH: [u8; 256] = [ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, // 0x1F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, // 0x3F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, // 0x5F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, // 0x7F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, // 0x9F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, // 0xBF + 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, // 0xDF + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xEF + 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xFF +]; + +/// Given a first byte, determines how many bytes are in this UTF-8 character. +#[unstable(feature = "str_internals", issue = "none")] +#[inline] +pub fn utf8_char_width(b: u8) -> usize { + UTF8_CHAR_WIDTH[b as usize] as usize +} + +/// Mask of the value bits of a continuation byte. +const CONT_MASK: u8 = 0b0011_1111; +/// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte. +const TAG_CONT_U8: u8 = 0b1000_0000; + +// truncate `&str` to length at most equal to `max` +// return `true` if it were truncated, and the new str. +pub(super) fn truncate_to_char_boundary(s: &str, mut max: usize) -> (bool, &str) { + if max >= s.len() { + (false, s) + } else { + while !s.is_char_boundary(max) { + max -= 1; + } + (true, &s[..max]) + } +} From 653b5bf18c962a35bb6d90e13e6681446cc49878 Mon Sep 17 00:00:00 2001 From: Lzu Tao Date: Fri, 4 Sep 2020 09:37:06 +0000 Subject: [PATCH 5/7] Move functions converting bytes to str to new mod --- library/core/src/str/converts.rs | 192 +++++++++++++++++++++++++++++ library/core/src/str/error.rs | 4 +- library/core/src/str/mod.rs | 200 ++----------------------------- 3 files changed, 202 insertions(+), 194 deletions(-) create mode 100644 library/core/src/str/converts.rs diff --git a/library/core/src/str/converts.rs b/library/core/src/str/converts.rs new file mode 100644 index 00000000000..de2a93f7350 --- /dev/null +++ b/library/core/src/str/converts.rs @@ -0,0 +1,192 @@ +//! Ways to create a `str` from bytes slice. + +use crate::mem; + +use super::validations::run_utf8_validation; +use super::Utf8Error; + +/// Converts a slice of bytes to a string slice. 
+/// +/// A string slice ([`&str`]) is made of bytes ([`u8`]), and a byte slice +/// ([`&[u8]`][byteslice]) is made of bytes, so this function converts between +/// the two. Not all byte slices are valid string slices, however: [`&str`] requires +/// that it is valid UTF-8. `from_utf8()` checks to ensure that the bytes are valid +/// UTF-8, and then does the conversion. +/// +/// [`&str`]: str +/// [byteslice]: ../../std/primitive.slice.html +/// +/// If you are sure that the byte slice is valid UTF-8, and you don't want to +/// incur the overhead of the validity check, there is an unsafe version of +/// this function, [`from_utf8_unchecked`], which has the same +/// behavior but skips the check. +/// +/// If you need a `String` instead of a `&str`, consider +/// [`String::from_utf8`][string]. +/// +/// [string]: ../../std/string/struct.String.html#method.from_utf8 +/// +/// Because you can stack-allocate a `[u8; N]`, and you can take a +/// [`&[u8]`][byteslice] of it, this function is one way to have a +/// stack-allocated string. There is an example of this in the +/// examples section below. +/// +/// [byteslice]: ../../std/primitive.slice.html +/// +/// # Errors +/// +/// Returns `Err` if the slice is not UTF-8 with a description as to why the +/// provided slice is not UTF-8. +/// +/// # Examples +/// +/// Basic usage: +/// +/// ``` +/// use std::str; +/// +/// // some bytes, in a vector +/// let sparkle_heart = vec![240, 159, 146, 150]; +/// +/// // We know these bytes are valid, so just use `unwrap()`. +/// let sparkle_heart = str::from_utf8(&sparkle_heart).unwrap(); +/// +/// assert_eq!("💖", sparkle_heart); +/// ``` +/// +/// Incorrect bytes: +/// +/// ``` +/// use std::str; +/// +/// // some invalid bytes, in a vector +/// let sparkle_heart = vec![0, 159, 146, 150]; +/// +/// assert!(str::from_utf8(&sparkle_heart).is_err()); +/// ``` +/// +/// See the docs for [`Utf8Error`] for more details on the kinds of +/// errors that can be returned. 
+/// +/// A "stack allocated string": +/// +/// ``` +/// use std::str; +/// +/// // some bytes, in a stack-allocated array +/// let sparkle_heart = [240, 159, 146, 150]; +/// +/// // We know these bytes are valid, so just use `unwrap()`. +/// let sparkle_heart = str::from_utf8(&sparkle_heart).unwrap(); +/// +/// assert_eq!("💖", sparkle_heart); +/// ``` +#[stable(feature = "rust1", since = "1.0.0")] +pub fn from_utf8(v: &[u8]) -> Result<&str, Utf8Error> { + run_utf8_validation(v)?; + // SAFETY: Just ran validation. + Ok(unsafe { from_utf8_unchecked(v) }) +} + +/// Converts a mutable slice of bytes to a mutable string slice. +/// +/// # Examples +/// +/// Basic usage: +/// +/// ``` +/// use std::str; +/// +/// // "Hello, Rust!" as a mutable vector +/// let mut hellorust = vec![72, 101, 108, 108, 111, 44, 32, 82, 117, 115, 116, 33]; +/// +/// // As we know these bytes are valid, we can use `unwrap()` +/// let outstr = str::from_utf8_mut(&mut hellorust).unwrap(); +/// +/// assert_eq!("Hello, Rust!", outstr); +/// ``` +/// +/// Incorrect bytes: +/// +/// ``` +/// use std::str; +/// +/// // Some invalid bytes in a mutable vector +/// let mut invalid = vec![128, 223]; +/// +/// assert!(str::from_utf8_mut(&mut invalid).is_err()); +/// ``` +/// See the docs for [`Utf8Error`] for more details on the kinds of +/// errors that can be returned. +#[stable(feature = "str_mut_extras", since = "1.20.0")] +pub fn from_utf8_mut(v: &mut [u8]) -> Result<&mut str, Utf8Error> { + run_utf8_validation(v)?; + // SAFETY: Just ran validation. + Ok(unsafe { from_utf8_unchecked_mut(v) }) +} + +/// Converts a slice of bytes to a string slice without checking +/// that the string contains valid UTF-8. +/// +/// See the safe version, [`from_utf8`], for more information. +/// +/// # Safety +/// +/// This function is unsafe because it does not check that the bytes passed to +/// it are valid UTF-8. 
If this constraint is violated, undefined behavior +/// results, as the rest of Rust assumes that [`&str`]s are valid UTF-8. +/// +/// [`&str`]: str +/// +/// # Examples +/// +/// Basic usage: +/// +/// ``` +/// use std::str; +/// +/// // some bytes, in a vector +/// let sparkle_heart = vec![240, 159, 146, 150]; +/// +/// let sparkle_heart = unsafe { +/// str::from_utf8_unchecked(&sparkle_heart) +/// }; +/// +/// assert_eq!("💖", sparkle_heart); +/// ``` +#[inline] +#[stable(feature = "rust1", since = "1.0.0")] +#[rustc_const_unstable(feature = "const_str_from_utf8_unchecked", issue = "75196")] +#[allow_internal_unstable(const_fn_transmute)] +pub const unsafe fn from_utf8_unchecked(v: &[u8]) -> &str { + // SAFETY: the caller must guarantee that the bytes `v` are valid UTF-8. + // Also relies on `&str` and `&[u8]` having the same layout. + unsafe { mem::transmute(v) } +} + +/// Converts a slice of bytes to a string slice without checking +/// that the string contains valid UTF-8; mutable version. +/// +/// See the immutable version, [`from_utf8_unchecked()`] for more information. +/// +/// # Examples +/// +/// Basic usage: +/// +/// ``` +/// use std::str; +/// +/// let mut heart = vec![240, 159, 146, 150]; +/// let heart = unsafe { str::from_utf8_unchecked_mut(&mut heart) }; +/// +/// assert_eq!("💖", heart); +/// ``` +#[inline] +#[stable(feature = "str_mut_extras", since = "1.20.0")] +pub unsafe fn from_utf8_unchecked_mut(v: &mut [u8]) -> &mut str { + // SAFETY: the caller must guarantee that the bytes `v` + // are valid UTF-8, thus the cast to `*mut str` is safe. + // Also, the pointer dereference is safe because that pointer + // comes from a reference which is guaranteed to be valid for writes. 
+ unsafe { &mut *(v as *mut [u8] as *mut str) } +} diff --git a/library/core/src/str/error.rs b/library/core/src/str/error.rs index 43b790a4aca..427f720d68c 100644 --- a/library/core/src/str/error.rs +++ b/library/core/src/str/error.rs @@ -9,7 +9,7 @@ use crate::fmt; /// and [`&str`]s make use of this error, for example. /// /// [`String`]: ../../std/string/struct.String.html#method.from_utf8 -/// [`&str`]: from_utf8 +/// [`&str`]: super::from_utf8 /// /// # Examples /// @@ -114,7 +114,7 @@ impl fmt::Display for Utf8Error { /// An error returned when parsing a `bool` using [`from_str`] fails /// -/// [`from_str`]: FromStr::from_str +/// [`from_str`]: super::FromStr::from_str #[derive(Debug, Clone, PartialEq, Eq)] #[stable(feature = "rust1", since = "1.0.0")] pub struct ParseBoolError { diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs index ab9bec2fd2d..ada5a4fa39f 100644 --- a/library/core/src/str/mod.rs +++ b/library/core/src/str/mod.rs @@ -8,6 +8,7 @@ #![stable(feature = "rust1", since = "1.0.0")] +mod converts; mod error; mod iter; mod traits; @@ -26,6 +27,12 @@ pub mod pattern; #[allow(missing_docs)] pub mod lossy; +#[stable(feature = "rust1", since = "1.0.0")] +pub use converts::{from_utf8, from_utf8_unchecked}; + +#[stable(feature = "str_mut_extras", since = "1.20.0")] +pub use converts::{from_utf8_mut, from_utf8_unchecked_mut}; + #[stable(feature = "rust1", since = "1.0.0")] pub use error::{ParseBoolError, Utf8Error}; @@ -70,198 +77,7 @@ use iter::MatchIndicesInternal; use iter::SplitInternal; use iter::{MatchesInternal, SplitNInternal}; -use validations::{run_utf8_validation, truncate_to_char_boundary}; - -/* -Section: Creating a string -*/ - -/// Converts a slice of bytes to a string slice. -/// -/// A string slice ([`&str`]) is made of bytes ([`u8`]), and a byte slice -/// ([`&[u8]`][byteslice]) is made of bytes, so this function converts between -/// the two. 
Not all byte slices are valid string slices, however: [`&str`] requires -/// that it is valid UTF-8. `from_utf8()` checks to ensure that the bytes are valid -/// UTF-8, and then does the conversion. -/// -/// [`&str`]: str -/// [byteslice]: ../../std/primitive.slice.html -/// -/// If you are sure that the byte slice is valid UTF-8, and you don't want to -/// incur the overhead of the validity check, there is an unsafe version of -/// this function, [`from_utf8_unchecked`], which has the same -/// behavior but skips the check. -/// -/// If you need a `String` instead of a `&str`, consider -/// [`String::from_utf8`][string]. -/// -/// [string]: ../../std/string/struct.String.html#method.from_utf8 -/// -/// Because you can stack-allocate a `[u8; N]`, and you can take a -/// [`&[u8]`][byteslice] of it, this function is one way to have a -/// stack-allocated string. There is an example of this in the -/// examples section below. -/// -/// [byteslice]: ../../std/primitive.slice.html -/// -/// # Errors -/// -/// Returns `Err` if the slice is not UTF-8 with a description as to why the -/// provided slice is not UTF-8. -/// -/// # Examples -/// -/// Basic usage: -/// -/// ``` -/// use std::str; -/// -/// // some bytes, in a vector -/// let sparkle_heart = vec![240, 159, 146, 150]; -/// -/// // We know these bytes are valid, so just use `unwrap()`. -/// let sparkle_heart = str::from_utf8(&sparkle_heart).unwrap(); -/// -/// assert_eq!("💖", sparkle_heart); -/// ``` -/// -/// Incorrect bytes: -/// -/// ``` -/// use std::str; -/// -/// // some invalid bytes, in a vector -/// let sparkle_heart = vec![0, 159, 146, 150]; -/// -/// assert!(str::from_utf8(&sparkle_heart).is_err()); -/// ``` -/// -/// See the docs for [`Utf8Error`] for more details on the kinds of -/// errors that can be returned. 
-/// -/// A "stack allocated string": -/// -/// ``` -/// use std::str; -/// -/// // some bytes, in a stack-allocated array -/// let sparkle_heart = [240, 159, 146, 150]; -/// -/// // We know these bytes are valid, so just use `unwrap()`. -/// let sparkle_heart = str::from_utf8(&sparkle_heart).unwrap(); -/// -/// assert_eq!("💖", sparkle_heart); -/// ``` -#[stable(feature = "rust1", since = "1.0.0")] -pub fn from_utf8(v: &[u8]) -> Result<&str, Utf8Error> { - run_utf8_validation(v)?; - // SAFETY: Just ran validation. - Ok(unsafe { from_utf8_unchecked(v) }) -} - -/// Converts a mutable slice of bytes to a mutable string slice. -/// -/// # Examples -/// -/// Basic usage: -/// -/// ``` -/// use std::str; -/// -/// // "Hello, Rust!" as a mutable vector -/// let mut hellorust = vec![72, 101, 108, 108, 111, 44, 32, 82, 117, 115, 116, 33]; -/// -/// // As we know these bytes are valid, we can use `unwrap()` -/// let outstr = str::from_utf8_mut(&mut hellorust).unwrap(); -/// -/// assert_eq!("Hello, Rust!", outstr); -/// ``` -/// -/// Incorrect bytes: -/// -/// ``` -/// use std::str; -/// -/// // Some invalid bytes in a mutable vector -/// let mut invalid = vec![128, 223]; -/// -/// assert!(str::from_utf8_mut(&mut invalid).is_err()); -/// ``` -/// See the docs for [`Utf8Error`] for more details on the kinds of -/// errors that can be returned. -#[stable(feature = "str_mut_extras", since = "1.20.0")] -pub fn from_utf8_mut(v: &mut [u8]) -> Result<&mut str, Utf8Error> { - run_utf8_validation(v)?; - // SAFETY: Just ran validation. - Ok(unsafe { from_utf8_unchecked_mut(v) }) -} - -/// Converts a slice of bytes to a string slice without checking -/// that the string contains valid UTF-8. -/// -/// See the safe version, [`from_utf8`], for more information. -/// -/// # Safety -/// -/// This function is unsafe because it does not check that the bytes passed to -/// it are valid UTF-8. 
If this constraint is violated, undefined behavior -/// results, as the rest of Rust assumes that [`&str`]s are valid UTF-8. -/// -/// [`&str`]: str -/// -/// # Examples -/// -/// Basic usage: -/// -/// ``` -/// use std::str; -/// -/// // some bytes, in a vector -/// let sparkle_heart = vec![240, 159, 146, 150]; -/// -/// let sparkle_heart = unsafe { -/// str::from_utf8_unchecked(&sparkle_heart) -/// }; -/// -/// assert_eq!("💖", sparkle_heart); -/// ``` -#[inline] -#[stable(feature = "rust1", since = "1.0.0")] -#[rustc_const_unstable(feature = "const_str_from_utf8_unchecked", issue = "75196")] -#[allow(unused_attributes)] -#[allow_internal_unstable(const_fn_transmute)] -pub const unsafe fn from_utf8_unchecked(v: &[u8]) -> &str { - // SAFETY: the caller must guarantee that the bytes `v` are valid UTF-8. - // Also relies on `&str` and `&[u8]` having the same layout. - unsafe { mem::transmute(v) } -} - -/// Converts a slice of bytes to a string slice without checking -/// that the string contains valid UTF-8; mutable version. -/// -/// See the immutable version, [`from_utf8_unchecked()`] for more information. -/// -/// # Examples -/// -/// Basic usage: -/// -/// ``` -/// use std::str; -/// -/// let mut heart = vec![240, 159, 146, 150]; -/// let heart = unsafe { str::from_utf8_unchecked_mut(&mut heart) }; -/// -/// assert_eq!("💖", heart); -/// ``` -#[inline] -#[stable(feature = "str_mut_extras", since = "1.20.0")] -pub unsafe fn from_utf8_unchecked_mut(v: &mut [u8]) -> &mut str { - // SAFETY: the caller must guarantee that the bytes `v` - // are valid UTF-8, thus the cast to `*mut str` is safe. - // Also, the pointer dereference is safe because that pointer - // comes from a reference which is guaranteed to be valid for writes. - unsafe { &mut *(v as *mut [u8] as *mut str) } -} +use validations::truncate_to_char_boundary; impl_fn_for_zst! 
{ /// A nameable, cloneable fn type From 37cd79cd323a7f1adcad66205c7855fb6c92f062 Mon Sep 17 00:00:00 2001 From: Lzu Tao Date: Fri, 4 Sep 2020 09:40:20 +0000 Subject: [PATCH 6/7] Gather all ZST structs of str together --- library/core/src/str/mod.rs | 48 +++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs index ada5a4fa39f..f30e4786970 100644 --- a/library/core/src/str/mod.rs +++ b/library/core/src/str/mod.rs @@ -79,16 +79,6 @@ use iter::{MatchesInternal, SplitNInternal}; use validations::truncate_to_char_boundary; -impl_fn_for_zst! { - /// A nameable, cloneable fn type - #[derive(Clone)] - struct LinesAnyMap impl<'a> Fn = |line: &'a str| -> &'a str { - let l = line.len(); - if l > 0 && line.as_bytes()[l - 1] == b'\r' { &line[0 .. l - 1] } - else { line } - }; -} - #[inline(never)] #[cold] #[track_caller] @@ -2425,22 +2415,6 @@ impl str { } } -impl_fn_for_zst! { - #[derive(Clone)] - struct CharEscapeDebugContinue impl Fn = |c: char| -> char::EscapeDebug { - c.escape_debug_ext(false) - }; - - #[derive(Clone)] - struct CharEscapeUnicode impl Fn = |c: char| -> char::EscapeUnicode { - c.escape_unicode() - }; - #[derive(Clone)] - struct CharEscapeDefault impl Fn = |c: char| -> char::EscapeDefault { - c.escape_default() - }; -} - #[stable(feature = "rust1", since = "1.0.0")] impl AsRef<[u8]> for str { #[inline] @@ -2467,6 +2441,28 @@ impl Default for &mut str { } impl_fn_for_zst! { + /// A nameable, cloneable fn type + #[derive(Clone)] + struct LinesAnyMap impl<'a> Fn = |line: &'a str| -> &'a str { + let l = line.len(); + if l > 0 && line.as_bytes()[l - 1] == b'\r' { &line[0 .. 
l - 1] } + else { line } + }; + + #[derive(Clone)] + struct CharEscapeDebugContinue impl Fn = |c: char| -> char::EscapeDebug { + c.escape_debug_ext(false) + }; + + #[derive(Clone)] + struct CharEscapeUnicode impl Fn = |c: char| -> char::EscapeUnicode { + c.escape_unicode() + }; + #[derive(Clone)] + struct CharEscapeDefault impl Fn = |c: char| -> char::EscapeDefault { + c.escape_default() + }; + #[derive(Clone)] struct IsWhitespace impl Fn = |c: char| -> bool { c.is_whitespace() From dce7248a39e7f8907cb9c5cfe719f6f63da1fa1f Mon Sep 17 00:00:00 2001 From: Lzu Tao Date: Fri, 4 Sep 2020 09:41:03 +0000 Subject: [PATCH 7/7] Remove unneeded tidy comment --- library/core/src/str/mod.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs index f30e4786970..3e18a4e7062 100644 --- a/library/core/src/str/mod.rs +++ b/library/core/src/str/mod.rs @@ -1,5 +1,3 @@ -// ignore-tidy-filelength - //! String manipulation. //! //! For more details, see the [`std::str`] module.