Auto merge of #76325 - lzutao:split-core-str, r=Amanieu

Split core/str/mod.rs to smaller files

Note for reviewer:
* I split to multiple commits for easier reviewing, but I could git squash them all to one if requested.
* Recommend pulling this change locally and using advanced git diff viewer or this command:
  ```bash
  git show --reverse --color-moved=dimmed-zebra --color-moved-ws=ignore-all-space master..
  ```

---

I split `core/str/mod.rs` to these modules:

* `converts`: Contains helper functions to convert from bytes to str.
* `error`: For error structs like Utf8Error.
* `iter`: For iterators of many str methods.
* `traits`: For indexing operations and build in traits on str.
* `validations`: For functions validating utf8 --- This name is awkward, maybe utf8.rs is better.
This commit is contained in:
bors 2020-09-30 23:04:16 +00:00
commit 9bb55dc864
7 changed files with 2519 additions and 2457 deletions

View File

@ -0,0 +1,192 @@
//! Ways to create a `str` from bytes slice.
use crate::mem;
use super::validations::run_utf8_validation;
use super::Utf8Error;
/// Converts a slice of bytes to a string slice.
///
/// A string slice ([`&str`]) is made of bytes ([`u8`]), and a byte slice
/// ([`&[u8]`][byteslice]) is made of bytes, so this function converts between
/// the two. Not all byte slices are valid string slices, however: [`&str`] requires
/// that it is valid UTF-8. `from_utf8()` checks to ensure that the bytes are valid
/// UTF-8, and then does the conversion.
///
/// [`&str`]: str
/// [byteslice]: ../../std/primitive.slice.html
///
/// If you are sure that the byte slice is valid UTF-8, and you don't want to
/// incur the overhead of the validity check, there is an unsafe version of
/// this function, [`from_utf8_unchecked`], which has the same
/// behavior but skips the check.
///
/// If you need a `String` instead of a `&str`, consider
/// [`String::from_utf8`][string].
///
/// [string]: ../../std/string/struct.String.html#method.from_utf8
///
/// Because you can stack-allocate a `[u8; N]`, and you can take a
/// [`&[u8]`][byteslice] of it, this function is one way to have a
/// stack-allocated string. There is an example of this in the
/// examples section below.
///
/// [byteslice]: ../../std/primitive.slice.html
///
/// # Errors
///
/// Returns `Err` if the slice is not UTF-8 with a description as to why the
/// provided slice is not UTF-8.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// use std::str;
///
/// // some bytes, in a vector
/// let sparkle_heart = vec![240, 159, 146, 150];
///
/// // We know these bytes are valid, so just use `unwrap()`.
/// let sparkle_heart = str::from_utf8(&sparkle_heart).unwrap();
///
/// assert_eq!("💖", sparkle_heart);
/// ```
///
/// Incorrect bytes:
///
/// ```
/// use std::str;
///
/// // some invalid bytes, in a vector
/// let sparkle_heart = vec![0, 159, 146, 150];
///
/// assert!(str::from_utf8(&sparkle_heart).is_err());
/// ```
///
/// See the docs for [`Utf8Error`] for more details on the kinds of
/// errors that can be returned.
///
/// A "stack allocated string":
///
/// ```
/// use std::str;
///
/// // some bytes, in a stack-allocated array
/// let sparkle_heart = [240, 159, 146, 150];
///
/// // We know these bytes are valid, so just use `unwrap()`.
/// let sparkle_heart = str::from_utf8(&sparkle_heart).unwrap();
///
/// assert_eq!("💖", sparkle_heart);
/// ```
#[stable(feature = "rust1", since = "1.0.0")]
pub fn from_utf8(v: &[u8]) -> Result<&str, Utf8Error> {
run_utf8_validation(v)?;
// SAFETY: Just ran validation.
Ok(unsafe { from_utf8_unchecked(v) })
}
/// Converts a mutable slice of bytes to a mutable string slice.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// use std::str;
///
/// // "Hello, Rust!" as a mutable vector
/// let mut hellorust = vec![72, 101, 108, 108, 111, 44, 32, 82, 117, 115, 116, 33];
///
/// // As we know these bytes are valid, we can use `unwrap()`
/// let outstr = str::from_utf8_mut(&mut hellorust).unwrap();
///
/// assert_eq!("Hello, Rust!", outstr);
/// ```
///
/// Incorrect bytes:
///
/// ```
/// use std::str;
///
/// // Some invalid bytes in a mutable vector
/// let mut invalid = vec![128, 223];
///
/// assert!(str::from_utf8_mut(&mut invalid).is_err());
/// ```
/// See the docs for [`Utf8Error`] for more details on the kinds of
/// errors that can be returned.
#[stable(feature = "str_mut_extras", since = "1.20.0")]
pub fn from_utf8_mut(v: &mut [u8]) -> Result<&mut str, Utf8Error> {
run_utf8_validation(v)?;
// SAFETY: Just ran validation.
Ok(unsafe { from_utf8_unchecked_mut(v) })
}
/// Converts a slice of bytes to a string slice without checking
/// that the string contains valid UTF-8.
///
/// See the safe version, [`from_utf8`], for more information.
///
/// # Safety
///
/// This function is unsafe because it does not check that the bytes passed to
/// it are valid UTF-8. If this constraint is violated, undefined behavior
/// results, as the rest of Rust assumes that [`&str`]s are valid UTF-8.
///
/// [`&str`]: str
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// use std::str;
///
/// // some bytes, in a vector
/// let sparkle_heart = vec![240, 159, 146, 150];
///
/// let sparkle_heart = unsafe {
/// str::from_utf8_unchecked(&sparkle_heart)
/// };
///
/// assert_eq!("💖", sparkle_heart);
/// ```
#[inline]
#[stable(feature = "rust1", since = "1.0.0")]
#[rustc_const_unstable(feature = "const_str_from_utf8_unchecked", issue = "75196")]
#[allow_internal_unstable(const_fn_transmute)]
pub const unsafe fn from_utf8_unchecked(v: &[u8]) -> &str {
// SAFETY: the caller must guarantee that the bytes `v` are valid UTF-8.
// Also relies on `&str` and `&[u8]` having the same layout.
unsafe { mem::transmute(v) }
}
/// Converts a slice of bytes to a string slice without checking
/// that the string contains valid UTF-8; mutable version.
///
/// See the immutable version, [`from_utf8_unchecked()`] for more information.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// use std::str;
///
/// let mut heart = vec![240, 159, 146, 150];
/// let heart = unsafe { str::from_utf8_unchecked_mut(&mut heart) };
///
/// assert_eq!("💖", heart);
/// ```
#[inline]
#[stable(feature = "str_mut_extras", since = "1.20.0")]
pub unsafe fn from_utf8_unchecked_mut(v: &mut [u8]) -> &mut str {
// SAFETY: the caller must guarantee that the bytes `v`
// are valid UTF-8, thus the cast to `*mut str` is safe.
// Also, the pointer dereference is safe because that pointer
// comes from a reference which is guaranteed to be valid for writes.
unsafe { &mut *(v as *mut [u8] as *mut str) }
}

View File

@ -0,0 +1,129 @@
//! Defines utf8 error type.
use crate::fmt;
/// Errors which can occur when attempting to interpret a sequence of [`u8`]
/// as a string.
///
/// As such, the `from_utf8` family of functions and methods for both [`String`]s
/// and [`&str`]s make use of this error, for example.
///
/// [`String`]: ../../std/string/struct.String.html#method.from_utf8
/// [`&str`]: super::from_utf8
///
/// # Examples
///
/// This error types methods can be used to create functionality
/// similar to `String::from_utf8_lossy` without allocating heap memory:
///
/// ```
/// fn from_utf8_lossy<F>(mut input: &[u8], mut push: F) where F: FnMut(&str) {
/// loop {
/// match std::str::from_utf8(input) {
/// Ok(valid) => {
/// push(valid);
/// break
/// }
/// Err(error) => {
/// let (valid, after_valid) = input.split_at(error.valid_up_to());
/// unsafe {
/// push(std::str::from_utf8_unchecked(valid))
/// }
/// push("\u{FFFD}");
///
/// if let Some(invalid_sequence_length) = error.error_len() {
/// input = &after_valid[invalid_sequence_length..]
/// } else {
/// break
/// }
/// }
/// }
/// }
/// }
/// ```
#[derive(Copy, Eq, PartialEq, Clone, Debug)]
#[stable(feature = "rust1", since = "1.0.0")]
pub struct Utf8Error {
pub(super) valid_up_to: usize,
pub(super) error_len: Option<u8>,
}
impl Utf8Error {
/// Returns the index in the given string up to which valid UTF-8 was
/// verified.
///
/// It is the maximum index such that `from_utf8(&input[..index])`
/// would return `Ok(_)`.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// use std::str;
///
/// // some invalid bytes, in a vector
/// let sparkle_heart = vec![0, 159, 146, 150];
///
/// // std::str::from_utf8 returns a Utf8Error
/// let error = str::from_utf8(&sparkle_heart).unwrap_err();
///
/// // the second byte is invalid here
/// assert_eq!(1, error.valid_up_to());
/// ```
#[stable(feature = "utf8_error", since = "1.5.0")]
pub fn valid_up_to(&self) -> usize {
self.valid_up_to
}
/// Provides more information about the failure:
///
/// * `None`: the end of the input was reached unexpectedly.
/// `self.valid_up_to()` is 1 to 3 bytes from the end of the input.
/// If a byte stream (such as a file or a network socket) is being decoded incrementally,
/// this could be a valid `char` whose UTF-8 byte sequence is spanning multiple chunks.
///
/// * `Some(len)`: an unexpected byte was encountered.
/// The length provided is that of the invalid byte sequence
/// that starts at the index given by `valid_up_to()`.
/// Decoding should resume after that sequence
/// (after inserting a [`U+FFFD REPLACEMENT CHARACTER`][U+FFFD]) in case of
/// lossy decoding.
///
/// [U+FFFD]: ../../std/char/constant.REPLACEMENT_CHARACTER.html
#[stable(feature = "utf8_error_error_len", since = "1.20.0")]
pub fn error_len(&self) -> Option<usize> {
self.error_len.map(|len| len as usize)
}
}
#[stable(feature = "rust1", since = "1.0.0")]
impl fmt::Display for Utf8Error {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
if let Some(error_len) = self.error_len {
write!(
f,
"invalid utf-8 sequence of {} bytes from index {}",
error_len, self.valid_up_to
)
} else {
write!(f, "incomplete utf-8 byte sequence from index {}", self.valid_up_to)
}
}
}
/// An error returned when parsing a `bool` using [`from_str`] fails
///
/// [`from_str`]: super::FromStr::from_str
#[derive(Debug, Clone, PartialEq, Eq)]
#[stable(feature = "rust1", since = "1.0.0")]
pub struct ParseBoolError {
pub(super) _priv: (),
}
#[stable(feature = "rust1", since = "1.0.0")]
impl fmt::Display for ParseBoolError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
"provided string was not `true` or `false`".fmt(f)
}
}

1255
library/core/src/str/iter.rs Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,7 +1,9 @@
use crate::char;
use crate::fmt::{self, Write};
use crate::mem;
use crate::str as core_str;
use super::from_utf8_unchecked;
use super::validations::utf8_char_width;
/// Lossy UTF-8 string.
#[unstable(feature = "str_internals", issue = "none")]
@ -66,14 +68,14 @@ impl<'a> Iterator for Utf8LossyChunksIter<'a> {
if byte < 128 {
} else {
let w = core_str::utf8_char_width(byte);
let w = utf8_char_width(byte);
macro_rules! error {
() => {{
// SAFETY: We have checked up to `i` that source is valid UTF-8.
unsafe {
let r = Utf8LossyChunk {
valid: core_str::from_utf8_unchecked(&self.source[0..i_]),
valid: from_utf8_unchecked(&self.source[0..i_]),
broken: &self.source[i_..i],
};
self.source = &self.source[i..];
@ -133,7 +135,7 @@ impl<'a> Iterator for Utf8LossyChunksIter<'a> {
let r = Utf8LossyChunk {
// SAFETY: We have checked that the entire source is valid UTF-8.
valid: unsafe { core_str::from_utf8_unchecked(self.source) },
valid: unsafe { from_utf8_unchecked(self.source) },
broken: &[],
};
self.source = &[];

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,597 @@
//! Trait implementations for `str`.
use crate::cmp::Ordering;
use crate::ops;
use crate::ptr;
use crate::slice::SliceIndex;
use super::ParseBoolError;
/// Implements ordering of strings.
///
/// Strings are ordered lexicographically by their byte values. This orders Unicode code
/// points based on their positions in the code charts. This is not necessarily the same as
/// "alphabetical" order, which varies by language and locale. Sorting strings according to
/// culturally-accepted standards requires locale-specific data that is outside the scope of
/// the `str` type.
#[stable(feature = "rust1", since = "1.0.0")]
impl Ord for str {
#[inline]
fn cmp(&self, other: &str) -> Ordering {
self.as_bytes().cmp(other.as_bytes())
}
}
#[stable(feature = "rust1", since = "1.0.0")]
impl PartialEq for str {
#[inline]
fn eq(&self, other: &str) -> bool {
self.as_bytes() == other.as_bytes()
}
#[inline]
fn ne(&self, other: &str) -> bool {
!(*self).eq(other)
}
}
#[stable(feature = "rust1", since = "1.0.0")]
impl Eq for str {}
/// Implements comparison operations on strings.
///
/// Strings are compared lexicographically by their byte values. This compares Unicode code
/// points based on their positions in the code charts. This is not necessarily the same as
/// "alphabetical" order, which varies by language and locale. Comparing strings according to
/// culturally-accepted standards requires locale-specific data that is outside the scope of
/// the `str` type.
#[stable(feature = "rust1", since = "1.0.0")]
impl PartialOrd for str {
#[inline]
fn partial_cmp(&self, other: &str) -> Option<Ordering> {
Some(self.cmp(other))
}
}
#[stable(feature = "rust1", since = "1.0.0")]
impl<I> ops::Index<I> for str
where
I: SliceIndex<str>,
{
type Output = I::Output;
#[inline]
fn index(&self, index: I) -> &I::Output {
index.index(self)
}
}
#[stable(feature = "rust1", since = "1.0.0")]
impl<I> ops::IndexMut<I> for str
where
I: SliceIndex<str>,
{
#[inline]
fn index_mut(&mut self, index: I) -> &mut I::Output {
index.index_mut(self)
}
}
#[inline(never)]
#[cold]
#[track_caller]
fn str_index_overflow_fail() -> ! {
panic!("attempted to index str up to maximum usize");
}
/// Implements substring slicing with syntax `&self[..]` or `&mut self[..]`.
///
/// Returns a slice of the whole string, i.e., returns `&self` or `&mut
/// self`. Equivalent to `&self[0 .. len]` or `&mut self[0 .. len]`. Unlike
/// other indexing operations, this can never panic.
///
/// This operation is `O(1)`.
///
/// Prior to 1.20.0, these indexing operations were still supported by
/// direct implementation of `Index` and `IndexMut`.
///
/// Equivalent to `&self[0 .. len]` or `&mut self[0 .. len]`.
#[stable(feature = "str_checked_slicing", since = "1.20.0")]
unsafe impl SliceIndex<str> for ops::RangeFull {
type Output = str;
#[inline]
fn get(self, slice: &str) -> Option<&Self::Output> {
Some(slice)
}
#[inline]
fn get_mut(self, slice: &mut str) -> Option<&mut Self::Output> {
Some(slice)
}
#[inline]
unsafe fn get_unchecked(self, slice: *const str) -> *const Self::Output {
slice
}
#[inline]
unsafe fn get_unchecked_mut(self, slice: *mut str) -> *mut Self::Output {
slice
}
#[inline]
fn index(self, slice: &str) -> &Self::Output {
slice
}
#[inline]
fn index_mut(self, slice: &mut str) -> &mut Self::Output {
slice
}
}
/// Implements substring slicing with syntax `&self[begin .. end]` or `&mut
/// self[begin .. end]`.
///
/// Returns a slice of the given string from the byte range
/// [`begin`, `end`).
///
/// This operation is `O(1)`.
///
/// Prior to 1.20.0, these indexing operations were still supported by
/// direct implementation of `Index` and `IndexMut`.
///
/// # Panics
///
/// Panics if `begin` or `end` does not point to the starting byte offset of
/// a character (as defined by `is_char_boundary`), if `begin > end`, or if
/// `end > len`.
///
/// # Examples
///
/// ```
/// let s = "Löwe 老虎 Léopard";
/// assert_eq!(&s[0 .. 1], "L");
///
/// assert_eq!(&s[1 .. 9], "öwe 老");
///
/// // these will panic:
/// // byte 2 lies within `ö`:
/// // &s[2 ..3];
///
/// // byte 8 lies within `老`
/// // &s[1 .. 8];
///
/// // byte 100 is outside the string
/// // &s[3 .. 100];
/// ```
#[stable(feature = "str_checked_slicing", since = "1.20.0")]
unsafe impl SliceIndex<str> for ops::Range<usize> {
type Output = str;
#[inline]
fn get(self, slice: &str) -> Option<&Self::Output> {
if self.start <= self.end
&& slice.is_char_boundary(self.start)
&& slice.is_char_boundary(self.end)
{
// SAFETY: just checked that `start` and `end` are on a char boundary,
// and we are passing in a safe reference, so the return value will also be one.
// We also checked char boundaries, so this is valid UTF-8.
Some(unsafe { &*self.get_unchecked(slice) })
} else {
None
}
}
#[inline]
fn get_mut(self, slice: &mut str) -> Option<&mut Self::Output> {
if self.start <= self.end
&& slice.is_char_boundary(self.start)
&& slice.is_char_boundary(self.end)
{
// SAFETY: just checked that `start` and `end` are on a char boundary.
// We know the pointer is unique because we got it from `slice`.
Some(unsafe { &mut *self.get_unchecked_mut(slice) })
} else {
None
}
}
#[inline]
unsafe fn get_unchecked(self, slice: *const str) -> *const Self::Output {
let slice = slice as *const [u8];
// SAFETY: the caller guarantees that `self` is in bounds of `slice`
// which satisfies all the conditions for `add`.
let ptr = unsafe { slice.as_ptr().add(self.start) };
let len = self.end - self.start;
ptr::slice_from_raw_parts(ptr, len) as *const str
}
#[inline]
unsafe fn get_unchecked_mut(self, slice: *mut str) -> *mut Self::Output {
let slice = slice as *mut [u8];
// SAFETY: see comments for `get_unchecked`.
let ptr = unsafe { slice.as_mut_ptr().add(self.start) };
let len = self.end - self.start;
ptr::slice_from_raw_parts_mut(ptr, len) as *mut str
}
#[inline]
fn index(self, slice: &str) -> &Self::Output {
let (start, end) = (self.start, self.end);
match self.get(slice) {
Some(s) => s,
None => super::slice_error_fail(slice, start, end),
}
}
#[inline]
fn index_mut(self, slice: &mut str) -> &mut Self::Output {
// is_char_boundary checks that the index is in [0, .len()]
// cannot reuse `get` as above, because of NLL trouble
if self.start <= self.end
&& slice.is_char_boundary(self.start)
&& slice.is_char_boundary(self.end)
{
// SAFETY: just checked that `start` and `end` are on a char boundary,
// and we are passing in a safe reference, so the return value will also be one.
unsafe { &mut *self.get_unchecked_mut(slice) }
} else {
super::slice_error_fail(slice, self.start, self.end)
}
}
}
/// Implements substring slicing with syntax `&self[.. end]` or `&mut
/// self[.. end]`.
///
/// Returns a slice of the given string from the byte range [`0`, `end`).
/// Equivalent to `&self[0 .. end]` or `&mut self[0 .. end]`.
///
/// This operation is `O(1)`.
///
/// Prior to 1.20.0, these indexing operations were still supported by
/// direct implementation of `Index` and `IndexMut`.
///
/// # Panics
///
/// Panics if `end` does not point to the starting byte offset of a
/// character (as defined by `is_char_boundary`), or if `end > len`.
#[stable(feature = "str_checked_slicing", since = "1.20.0")]
unsafe impl SliceIndex<str> for ops::RangeTo<usize> {
type Output = str;
#[inline]
fn get(self, slice: &str) -> Option<&Self::Output> {
if slice.is_char_boundary(self.end) {
// SAFETY: just checked that `end` is on a char boundary,
// and we are passing in a safe reference, so the return value will also be one.
Some(unsafe { &*self.get_unchecked(slice) })
} else {
None
}
}
#[inline]
fn get_mut(self, slice: &mut str) -> Option<&mut Self::Output> {
if slice.is_char_boundary(self.end) {
// SAFETY: just checked that `end` is on a char boundary,
// and we are passing in a safe reference, so the return value will also be one.
Some(unsafe { &mut *self.get_unchecked_mut(slice) })
} else {
None
}
}
#[inline]
unsafe fn get_unchecked(self, slice: *const str) -> *const Self::Output {
let slice = slice as *const [u8];
let ptr = slice.as_ptr();
ptr::slice_from_raw_parts(ptr, self.end) as *const str
}
#[inline]
unsafe fn get_unchecked_mut(self, slice: *mut str) -> *mut Self::Output {
let slice = slice as *mut [u8];
let ptr = slice.as_mut_ptr();
ptr::slice_from_raw_parts_mut(ptr, self.end) as *mut str
}
#[inline]
fn index(self, slice: &str) -> &Self::Output {
let end = self.end;
match self.get(slice) {
Some(s) => s,
None => super::slice_error_fail(slice, 0, end),
}
}
#[inline]
fn index_mut(self, slice: &mut str) -> &mut Self::Output {
if slice.is_char_boundary(self.end) {
// SAFETY: just checked that `end` is on a char boundary,
// and we are passing in a safe reference, so the return value will also be one.
unsafe { &mut *self.get_unchecked_mut(slice) }
} else {
super::slice_error_fail(slice, 0, self.end)
}
}
}
/// Implements substring slicing with syntax `&self[begin ..]` or `&mut
/// self[begin ..]`.
///
/// Returns a slice of the given string from the byte range [`begin`,
/// `len`). Equivalent to `&self[begin .. len]` or `&mut self[begin ..
/// len]`.
///
/// This operation is `O(1)`.
///
/// Prior to 1.20.0, these indexing operations were still supported by
/// direct implementation of `Index` and `IndexMut`.
///
/// # Panics
///
/// Panics if `begin` does not point to the starting byte offset of
/// a character (as defined by `is_char_boundary`), or if `begin > len`.
#[stable(feature = "str_checked_slicing", since = "1.20.0")]
unsafe impl SliceIndex<str> for ops::RangeFrom<usize> {
type Output = str;
#[inline]
fn get(self, slice: &str) -> Option<&Self::Output> {
if slice.is_char_boundary(self.start) {
// SAFETY: just checked that `start` is on a char boundary,
// and we are passing in a safe reference, so the return value will also be one.
Some(unsafe { &*self.get_unchecked(slice) })
} else {
None
}
}
#[inline]
fn get_mut(self, slice: &mut str) -> Option<&mut Self::Output> {
if slice.is_char_boundary(self.start) {
// SAFETY: just checked that `start` is on a char boundary,
// and we are passing in a safe reference, so the return value will also be one.
Some(unsafe { &mut *self.get_unchecked_mut(slice) })
} else {
None
}
}
#[inline]
unsafe fn get_unchecked(self, slice: *const str) -> *const Self::Output {
let slice = slice as *const [u8];
// SAFETY: the caller guarantees that `self` is in bounds of `slice`
// which satisfies all the conditions for `add`.
let ptr = unsafe { slice.as_ptr().add(self.start) };
let len = slice.len() - self.start;
ptr::slice_from_raw_parts(ptr, len) as *const str
}
#[inline]
unsafe fn get_unchecked_mut(self, slice: *mut str) -> *mut Self::Output {
let slice = slice as *mut [u8];
// SAFETY: identical to `get_unchecked`.
let ptr = unsafe { slice.as_mut_ptr().add(self.start) };
let len = slice.len() - self.start;
ptr::slice_from_raw_parts_mut(ptr, len) as *mut str
}
#[inline]
fn index(self, slice: &str) -> &Self::Output {
let (start, end) = (self.start, slice.len());
match self.get(slice) {
Some(s) => s,
None => super::slice_error_fail(slice, start, end),
}
}
#[inline]
fn index_mut(self, slice: &mut str) -> &mut Self::Output {
if slice.is_char_boundary(self.start) {
// SAFETY: just checked that `start` is on a char boundary,
// and we are passing in a safe reference, so the return value will also be one.
unsafe { &mut *self.get_unchecked_mut(slice) }
} else {
super::slice_error_fail(slice, self.start, slice.len())
}
}
}
/// Implements substring slicing with syntax `&self[begin ..= end]` or `&mut
/// self[begin ..= end]`.
///
/// Returns a slice of the given string from the byte range
/// [`begin`, `end`]. Equivalent to `&self [begin .. end + 1]` or `&mut
/// self[begin .. end + 1]`, except if `end` has the maximum value for
/// `usize`.
///
/// This operation is `O(1)`.
///
/// # Panics
///
/// Panics if `begin` does not point to the starting byte offset of
/// a character (as defined by `is_char_boundary`), if `end` does not point
/// to the ending byte offset of a character (`end + 1` is either a starting
/// byte offset or equal to `len`), if `begin > end`, or if `end >= len`.
#[stable(feature = "inclusive_range", since = "1.26.0")]
unsafe impl SliceIndex<str> for ops::RangeInclusive<usize> {
type Output = str;
#[inline]
fn get(self, slice: &str) -> Option<&Self::Output> {
if *self.end() == usize::MAX { None } else { (*self.start()..self.end() + 1).get(slice) }
}
#[inline]
fn get_mut(self, slice: &mut str) -> Option<&mut Self::Output> {
if *self.end() == usize::MAX {
None
} else {
(*self.start()..self.end() + 1).get_mut(slice)
}
}
#[inline]
unsafe fn get_unchecked(self, slice: *const str) -> *const Self::Output {
// SAFETY: the caller must uphold the safety contract for `get_unchecked`.
unsafe { (*self.start()..self.end() + 1).get_unchecked(slice) }
}
#[inline]
unsafe fn get_unchecked_mut(self, slice: *mut str) -> *mut Self::Output {
// SAFETY: the caller must uphold the safety contract for `get_unchecked_mut`.
unsafe { (*self.start()..self.end() + 1).get_unchecked_mut(slice) }
}
#[inline]
fn index(self, slice: &str) -> &Self::Output {
if *self.end() == usize::MAX {
str_index_overflow_fail();
}
(*self.start()..self.end() + 1).index(slice)
}
#[inline]
fn index_mut(self, slice: &mut str) -> &mut Self::Output {
if *self.end() == usize::MAX {
str_index_overflow_fail();
}
(*self.start()..self.end() + 1).index_mut(slice)
}
}
/// Implements substring slicing with syntax `&self[..= end]` or `&mut
/// self[..= end]`.
///
/// Returns a slice of the given string from the byte range [0, `end`].
/// Equivalent to `&self [0 .. end + 1]`, except if `end` has the maximum
/// value for `usize`.
///
/// This operation is `O(1)`.
///
/// # Panics
///
/// Panics if `end` does not point to the ending byte offset of a character
/// (`end + 1` is either a starting byte offset as defined by
/// `is_char_boundary`, or equal to `len`), or if `end >= len`.
#[stable(feature = "inclusive_range", since = "1.26.0")]
unsafe impl SliceIndex<str> for ops::RangeToInclusive<usize> {
type Output = str;
#[inline]
fn get(self, slice: &str) -> Option<&Self::Output> {
if self.end == usize::MAX { None } else { (..self.end + 1).get(slice) }
}
#[inline]
fn get_mut(self, slice: &mut str) -> Option<&mut Self::Output> {
if self.end == usize::MAX { None } else { (..self.end + 1).get_mut(slice) }
}
#[inline]
unsafe fn get_unchecked(self, slice: *const str) -> *const Self::Output {
// SAFETY: the caller must uphold the safety contract for `get_unchecked`.
unsafe { (..self.end + 1).get_unchecked(slice) }
}
#[inline]
unsafe fn get_unchecked_mut(self, slice: *mut str) -> *mut Self::Output {
// SAFETY: the caller must uphold the safety contract for `get_unchecked_mut`.
unsafe { (..self.end + 1).get_unchecked_mut(slice) }
}
#[inline]
fn index(self, slice: &str) -> &Self::Output {
if self.end == usize::MAX {
str_index_overflow_fail();
}
(..self.end + 1).index(slice)
}
#[inline]
fn index_mut(self, slice: &mut str) -> &mut Self::Output {
if self.end == usize::MAX {
str_index_overflow_fail();
}
(..self.end + 1).index_mut(slice)
}
}
/// Parse a value from a string
///
/// `FromStr`'s [`from_str`] method is often used implicitly, through
/// [`str`]'s [`parse`] method. See [`parse`]'s documentation for examples.
///
/// [`from_str`]: FromStr::from_str
/// [`parse`]: str::parse
///
/// `FromStr` does not have a lifetime parameter, and so you can only parse types
/// that do not contain a lifetime parameter themselves. In other words, you can
/// parse an `i32` with `FromStr`, but not a `&i32`. You can parse a struct that
/// contains an `i32`, but not one that contains an `&i32`.
///
/// # Examples
///
/// Basic implementation of `FromStr` on an example `Point` type:
///
/// ```
/// use std::str::FromStr;
/// use std::num::ParseIntError;
///
/// #[derive(Debug, PartialEq)]
/// struct Point {
/// x: i32,
/// y: i32
/// }
///
/// impl FromStr for Point {
/// type Err = ParseIntError;
///
/// fn from_str(s: &str) -> Result<Self, Self::Err> {
/// let coords: Vec<&str> = s.trim_matches(|p| p == '(' || p == ')' )
/// .split(',')
/// .collect();
///
/// let x_fromstr = coords[0].parse::<i32>()?;
/// let y_fromstr = coords[1].parse::<i32>()?;
///
/// Ok(Point { x: x_fromstr, y: y_fromstr })
/// }
/// }
///
/// let p = Point::from_str("(1,2)");
/// assert_eq!(p.unwrap(), Point{ x: 1, y: 2} )
/// ```
#[stable(feature = "rust1", since = "1.0.0")]
pub trait FromStr: Sized {
/// The associated error which can be returned from parsing.
#[stable(feature = "rust1", since = "1.0.0")]
type Err;
/// Parses a string `s` to return a value of this type.
///
/// If parsing succeeds, return the value inside [`Ok`], otherwise
/// when the string is ill-formatted return an error specific to the
/// inside [`Err`]. The error type is specific to implementation of the trait.
///
/// # Examples
///
/// Basic usage with [`i32`][ithirtytwo], a type that implements `FromStr`:
///
/// [ithirtytwo]: ../../std/primitive.i32.html
///
/// ```
/// use std::str::FromStr;
///
/// let s = "5";
/// let x = i32::from_str(s).unwrap();
///
/// assert_eq!(5, x);
/// ```
#[stable(feature = "rust1", since = "1.0.0")]
fn from_str(s: &str) -> Result<Self, Self::Err>;
}
#[stable(feature = "rust1", since = "1.0.0")]
impl FromStr for bool {
type Err = ParseBoolError;
/// Parse a `bool` from a string.
///
/// Yields a `Result<bool, ParseBoolError>`, because `s` may or may not
/// actually be parseable.
///
/// # Examples
///
/// ```
/// use std::str::FromStr;
///
/// assert_eq!(FromStr::from_str("true"), Ok(true));
/// assert_eq!(FromStr::from_str("false"), Ok(false));
/// assert!(<bool as FromStr>::from_str("not even a boolean").is_err());
/// ```
///
/// Note, in many cases, the `.parse()` method on `str` is more proper.
///
/// ```
/// assert_eq!("true".parse(), Ok(true));
/// assert_eq!("false".parse(), Ok(false));
/// assert!("not even a boolean".parse::<bool>().is_err());
/// ```
#[inline]
fn from_str(s: &str) -> Result<bool, ParseBoolError> {
match s {
"true" => Ok(true),
"false" => Ok(false),
_ => Err(ParseBoolError { _priv: () }),
}
}
}

View File

@ -0,0 +1,275 @@
//! Operations related to UTF-8 validation.
use crate::mem;
use super::Utf8Error;
/// Returns the initial codepoint accumulator for the first byte.
/// The first byte is special, only want bottom 5 bits for width 2, 4 bits
/// for width 3, and 3 bits for width 4.
#[inline]
fn utf8_first_byte(byte: u8, width: u32) -> u32 {
(byte & (0x7F >> width)) as u32
}
/// Returns the value of `ch` updated with continuation byte `byte`.
#[inline]
fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
(ch << 6) | (byte & CONT_MASK) as u32
}
/// Checks whether the byte is a UTF-8 continuation byte (i.e., starts with the
/// bits `10`).
#[inline]
pub(super) fn utf8_is_cont_byte(byte: u8) -> bool {
(byte & !CONT_MASK) == TAG_CONT_U8
}
#[inline]
fn unwrap_or_0(opt: Option<&u8>) -> u8 {
match opt {
Some(&byte) => byte,
None => 0,
}
}
/// Reads the next code point out of a byte iterator (assuming a
/// UTF-8-like encoding).
#[unstable(feature = "str_internals", issue = "none")]
#[inline]
pub fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> {
// Decode UTF-8
let x = *bytes.next()?;
if x < 128 {
return Some(x as u32);
}
// Multibyte case follows
// Decode from a byte combination out of: [[[x y] z] w]
// NOTE: Performance is sensitive to the exact formulation here
let init = utf8_first_byte(x, 2);
let y = unwrap_or_0(bytes.next());
let mut ch = utf8_acc_cont_byte(init, y);
if x >= 0xE0 {
// [[x y z] w] case
// 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
let z = unwrap_or_0(bytes.next());
let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
ch = init << 12 | y_z;
if x >= 0xF0 {
// [x y z w] case
// use only the lower 3 bits of `init`
let w = unwrap_or_0(bytes.next());
ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
}
}
Some(ch)
}
/// Reads the last code point out of a byte iterator (assuming a
/// UTF-8-like encoding).
#[inline]
pub(super) fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option<u32>
where
I: DoubleEndedIterator<Item = &'a u8>,
{
// Decode UTF-8
let w = match *bytes.next_back()? {
next_byte if next_byte < 128 => return Some(next_byte as u32),
back_byte => back_byte,
};
// Multibyte case follows
// Decode from a byte combination out of: [x [y [z w]]]
let mut ch;
let z = unwrap_or_0(bytes.next_back());
ch = utf8_first_byte(z, 2);
if utf8_is_cont_byte(z) {
let y = unwrap_or_0(bytes.next_back());
ch = utf8_first_byte(y, 3);
if utf8_is_cont_byte(y) {
let x = unwrap_or_0(bytes.next_back());
ch = utf8_first_byte(x, 4);
ch = utf8_acc_cont_byte(ch, y);
}
ch = utf8_acc_cont_byte(ch, z);
}
ch = utf8_acc_cont_byte(ch, w);
Some(ch)
}
// use truncation to fit u64 into usize
const NONASCII_MASK: usize = 0x80808080_80808080u64 as usize;
/// Returns `true` if any byte in the word `x` is nonascii (>= 128).
#[inline]
fn contains_nonascii(x: usize) -> bool {
(x & NONASCII_MASK) != 0
}
/// Walks through `v` checking that it's a valid UTF-8 sequence,
/// returning `Ok(())` in that case, or, if it is invalid, `Err(err)`.
#[inline(always)]
pub(super) fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
let mut index = 0;
let len = v.len();
let usize_bytes = mem::size_of::<usize>();
let ascii_block_size = 2 * usize_bytes;
let blocks_end = if len >= ascii_block_size { len - ascii_block_size + 1 } else { 0 };
let align = v.as_ptr().align_offset(usize_bytes);
while index < len {
let old_offset = index;
macro_rules! err {
($error_len: expr) => {
return Err(Utf8Error { valid_up_to: old_offset, error_len: $error_len });
};
}
macro_rules! next {
() => {{
index += 1;
// we needed data, but there was none: error!
if index >= len {
err!(None)
}
v[index]
}};
}
let first = v[index];
if first >= 128 {
let w = UTF8_CHAR_WIDTH[first as usize];
// 2-byte encoding is for codepoints \u{0080} to \u{07ff}
// first C2 80 last DF BF
// 3-byte encoding is for codepoints \u{0800} to \u{ffff}
// first E0 A0 80 last EF BF BF
// excluding surrogates codepoints \u{d800} to \u{dfff}
// ED A0 80 to ED BF BF
// 4-byte encoding is for codepoints \u{1000}0 to \u{10ff}ff
// first F0 90 80 80 last F4 8F BF BF
//
// Use the UTF-8 syntax from the RFC
//
// https://tools.ietf.org/html/rfc3629
// UTF8-1 = %x00-7F
// UTF8-2 = %xC2-DF UTF8-tail
// UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
// %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
// %xF4 %x80-8F 2( UTF8-tail )
match w {
2 => {
if next!() & !CONT_MASK != TAG_CONT_U8 {
err!(Some(1))
}
}
3 => {
match (first, next!()) {
(0xE0, 0xA0..=0xBF)
| (0xE1..=0xEC, 0x80..=0xBF)
| (0xED, 0x80..=0x9F)
| (0xEE..=0xEF, 0x80..=0xBF) => {}
_ => err!(Some(1)),
}
if next!() & !CONT_MASK != TAG_CONT_U8 {
err!(Some(2))
}
}
4 => {
match (first, next!()) {
(0xF0, 0x90..=0xBF) | (0xF1..=0xF3, 0x80..=0xBF) | (0xF4, 0x80..=0x8F) => {}
_ => err!(Some(1)),
}
if next!() & !CONT_MASK != TAG_CONT_U8 {
err!(Some(2))
}
if next!() & !CONT_MASK != TAG_CONT_U8 {
err!(Some(3))
}
}
_ => err!(Some(1)),
}
index += 1;
} else {
// Ascii case, try to skip forward quickly.
// When the pointer is aligned, read 2 words of data per iteration
// until we find a word containing a non-ascii byte.
if align != usize::MAX && align.wrapping_sub(index) % usize_bytes == 0 {
let ptr = v.as_ptr();
while index < blocks_end {
// SAFETY: since `align - index` and `ascii_block_size` are
// multiples of `usize_bytes`, `block = ptr.add(index)` is
// always aligned with a `usize` so it's safe to dereference
// both `block` and `block.offset(1)`.
unsafe {
let block = ptr.add(index) as *const usize;
// break if there is a nonascii byte
let zu = contains_nonascii(*block);
let zv = contains_nonascii(*block.offset(1));
if zu | zv {
break;
}
}
index += ascii_block_size;
}
// step from the point where the wordwise loop stopped
while index < len && v[index] < 128 {
index += 1;
}
} else {
index += 1;
}
}
}
Ok(())
}
// https://tools.ietf.org/html/rfc3629
static UTF8_CHAR_WIDTH: [u8; 256] = [
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, // 0x1F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, // 0x3F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, // 0x5F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, // 0x7F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, // 0x9F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, // 0xBF
0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, // 0xDF
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xEF
4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xFF
];
/// Given a first byte, determines how many bytes are in this UTF-8 character.
#[unstable(feature = "str_internals", issue = "none")]
#[inline]
pub fn utf8_char_width(b: u8) -> usize {
UTF8_CHAR_WIDTH[b as usize] as usize
}
/// Mask of the value bits of a continuation byte.
const CONT_MASK: u8 = 0b0011_1111;
/// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte.
const TAG_CONT_U8: u8 = 0b1000_0000;
// truncate `&str` to length at most equal to `max`
// return `true` if it were truncated, and the new str.
pub(super) fn truncate_to_char_boundary(s: &str, mut max: usize) -> (bool, &str) {
if max >= s.len() {
(false, s)
} else {
while !s.is_char_boundary(max) {
max -= 1;
}
(true, &s[..max])
}
}