rust/library/core/src/char/methods.rs

1664 lines
53 KiB
Rust
Raw Normal View History

//! impl char {}
2021-02-14 12:34:22 +01:00
use crate::intrinsics::likely;
2019-04-15 04:23:21 +02:00
use crate::slice;
use crate::str::from_utf8_unchecked_mut;
use crate::unicode::printable::is_printable;
use crate::unicode::{self, conversions};
2019-04-15 04:23:21 +02:00
use super::*;
Add libunicode; move unicode functions from core - created new crate, libunicode, below libstd - split Char trait into Char (libcore) and UnicodeChar (libunicode) - Unicode-aware functions now live in libunicode - is_alphabetic, is_XID_start, is_XID_continue, is_lowercase, is_uppercase, is_whitespace, is_alphanumeric, is_control, is_digit, to_uppercase, to_lowercase - added width method in UnicodeChar trait - determines printed width of character in columns, or None if it is a non-NULL control character - takes a boolean argument indicating whether the present context is CJK or not (characters with 'A'mbiguous widths are double-wide in CJK contexts, single-wide otherwise) - split StrSlice into StrSlice (libcore) and UnicodeStrSlice (libunicode) - functionality formerly in StrSlice that relied upon Unicode functionality from Char is now in UnicodeStrSlice - words, is_whitespace, is_alphanumeric, trim, trim_left, trim_right - also moved Words type alias into libunicode because words method is in UnicodeStrSlice - unified Unicode tables from libcollections, libcore, and libregex into libunicode - updated unicode.py in src/etc to generate aforementioned tables - generated new tables based on latest Unicode data - added UnicodeChar and UnicodeStrSlice traits to prelude - libunicode is now the collection point for the std::char module, combining the libunicode functionality with the Char functionality from libcore - thus, moved doc comment for char from core::char to unicode::char - libcollections remains the collection point for std::str The Unicode-aware functions that previously lived in the Char and StrSlice traits are no longer available to programs that only use libcore. 
To regain use of these methods, include the libunicode crate and use the UnicodeChar and/or UnicodeStrSlice traits: extern crate unicode; use unicode::UnicodeChar; use unicode::UnicodeStrSlice; use unicode::Words; // if you want to use the words() method NOTE: this does *not* impact programs that use libstd, since UnicodeChar and UnicodeStrSlice have been added to the prelude. closes #15224 [breaking-change]
2014-06-30 23:04:10 +02:00
2015-03-11 05:13:16 +01:00
#[lang = "char"]
impl char {
/// The highest valid code point a `char` can have, `U+10FFFF`.
///
/// A `char` is a [Unicode Scalar Value], which means that it is a [Code
/// Point], but only ones within a certain range. `MAX` is the highest valid
/// code point that's a valid [Unicode Scalar Value].
///
/// [Unicode Scalar Value]: https://www.unicode.org/glossary/#unicode_scalar_value
/// [Code Point]: https://www.unicode.org/glossary/#code_point
#[stable(feature = "assoc_char_consts", since = "1.52.0")]
pub const MAX: char = '\u{10ffff}';
/// `U+FFFD REPLACEMENT CHARACTER` (�) is used in Unicode to represent a
/// decoding error.
///
/// It can occur, for example, when giving ill-formed UTF-8 bytes to
/// [`String::from_utf8_lossy`](string/struct.String.html#method.from_utf8_lossy).
#[stable(feature = "assoc_char_consts", since = "1.52.0")]
pub const REPLACEMENT_CHARACTER: char = '\u{FFFD}';
/// The version of [Unicode](https://www.unicode.org/) that the Unicode parts of
/// `char` and `str` methods are based on.
///
/// New versions of Unicode are released regularly and subsequently all methods
/// in the standard library depending on Unicode are updated. Therefore the
/// behavior of some `char` and `str` methods and the value of this constant
/// change over time. This is *not* considered to be a breaking change.
///
/// The version numbering scheme is explained in
/// [Unicode 11.0 or later, Section 3.1 Versions of the Unicode Standard](https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf#page=4).
///
/// This constant re-exports `crate::unicode::UNICODE_VERSION` unchanged.
#[stable(feature = "assoc_char_consts", since = "1.52.0")]
pub const UNICODE_VERSION: (u8, u8, u8) = crate::unicode::UNICODE_VERSION;
/// Creates an iterator over the UTF-16 encoded code points in `iter`,
/// returning unpaired surrogates as `Err`s.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// use std::char::decode_utf16;
///
/// // 𝄞mus<invalid>ic<invalid>
/// let v = [
/// 0xD834, 0xDD1E, 0x006d, 0x0075, 0x0073, 0xDD1E, 0x0069, 0x0063, 0xD834,
/// ];
///
/// assert_eq!(
/// decode_utf16(v.iter().cloned())
/// .map(|r| r.map_err(|e| e.unpaired_surrogate()))
/// .collect::<Vec<_>>(),
/// vec![
/// Ok('𝄞'),
/// Ok('m'), Ok('u'), Ok('s'),
/// Err(0xDD1E),
/// Ok('i'), Ok('c'),
/// Err(0xD834)
/// ]
/// );
/// ```
///
/// A lossy decoder can be obtained by replacing `Err` results with the replacement character:
///
/// ```
/// use std::char::{decode_utf16, REPLACEMENT_CHARACTER};
///
/// // 𝄞mus<invalid>ic<invalid>
/// let v = [
/// 0xD834, 0xDD1E, 0x006d, 0x0075, 0x0073, 0xDD1E, 0x0069, 0x0063, 0xD834,
/// ];
///
/// assert_eq!(
/// decode_utf16(v.iter().cloned())
/// .map(|r| r.unwrap_or(REPLACEMENT_CHARACTER))
/// .collect::<String>(),
/// "𝄞mus\u{FFFD}ic\u{FFFD}"
/// );
/// ```
#[stable(feature = "assoc_char_funcs", since = "1.52.0")]
#[inline]
pub fn decode_utf16<I>(iter: I) -> DecodeUtf16<I::IntoIter>
where
    I: IntoIterator<Item = u16>,
{
    // Associated-function spelling of the free function in the `decode`
    // module; the argument is forwarded untouched.
    super::decode::decode_utf16(iter)
}
/// Converts a `u32` to a `char`.
///
/// Note that all `char`s are valid [`u32`]s, and can be cast to one with
/// `as`:
///
/// ```
/// let c = '💯';
/// let i = c as u32;
///
/// assert_eq!(128175, i);
/// ```
///
/// However, the reverse is not true: not all valid [`u32`]s are valid
/// `char`s. `from_u32()` will return `None` if the input is not a valid value
/// for a `char`.
///
/// For an unsafe version of this function which ignores these checks, see
/// [`from_u32_unchecked`].
///
2020-11-19 14:55:57 +01:00
/// [`from_u32_unchecked`]: #method.from_u32_unchecked
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// use std::char;
///
/// let c = char::from_u32(0x2764);
///
/// assert_eq!(Some('❤'), c);
/// ```
///
/// Returning `None` when the input is not a valid `char`:
///
/// ```
/// use std::char;
///
/// let c = char::from_u32(0x110000);
///
/// assert_eq!(None, c);
/// ```
#[stable(feature = "assoc_char_funcs", since = "1.52.0")]
#[inline]
pub fn from_u32(i: u32) -> Option<char> {
// Thin forwarding wrapper: the conversion is implemented by
// `super::convert::from_u32`, and its result is returned as-is.
super::convert::from_u32(i)
}
/// Converts a `u32` to a `char`, ignoring validity.
///
/// Note that all `char`s are valid [`u32`]s, and can be cast to one with
/// `as`:
///
/// ```
/// let c = '💯';
/// let i = c as u32;
///
/// assert_eq!(128175, i);
/// ```
///
/// However, the reverse is not true: not all valid [`u32`]s are valid
/// `char`s. `from_u32_unchecked()` will ignore this, and blindly cast to
/// `char`, possibly creating an invalid one.
///
/// # Safety
///
/// This function is unsafe, as it may construct invalid `char` values.
///
/// For a safe version of this function, see the [`from_u32`] function.
///
2020-11-19 14:55:57 +01:00
/// [`from_u32`]: #method.from_u32
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// use std::char;
///
/// let c = unsafe { char::from_u32_unchecked(0x2764) };
///
/// assert_eq!('❤', c);
/// ```
#[stable(feature = "assoc_char_funcs", since = "1.52.0")]
#[inline]
pub unsafe fn from_u32_unchecked(i: u32) -> char {
2020-06-22 00:54:46 +02:00
// SAFETY: the safety contract must be upheld by the caller.
unsafe { super::convert::from_u32_unchecked(i) }
}
/// Converts a digit in the given radix to a `char`.
///
/// A 'radix' here is sometimes also called a 'base'. A radix of two
/// indicates a binary number, a radix of ten, decimal, and a radix of
/// sixteen, hexadecimal, to give some common values. Arbitrary
/// radices are supported.
///
/// `from_digit()` will return `None` if the input is not a digit in
/// the given radix.
///
/// # Panics
///
/// Panics if given a radix larger than 36.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// use std::char;
///
/// let c = char::from_digit(4, 10);
///
/// assert_eq!(Some('4'), c);
///
/// // Decimal 11 is a single digit in base 16
/// let c = char::from_digit(11, 16);
///
/// assert_eq!(Some('b'), c);
/// ```
///
/// Returning `None` when the input is not a digit:
///
/// ```
/// use std::char;
///
/// let c = char::from_digit(20, 10);
///
/// assert_eq!(None, c);
/// ```
///
/// Passing a large radix, causing a panic:
///
/// ```should_panic
/// use std::char;
///
/// // this panics
/// char::from_digit(1, 37);
/// ```
#[stable(feature = "assoc_char_funcs", since = "1.52.0")]
#[inline]
pub fn from_digit(num: u32, radix: u32) -> Option<char> {
// Thin forwarding wrapper: both arguments are passed, in the same order,
// to `super::convert::from_digit`, and its result is returned as-is.
super::convert::from_digit(num, radix)
}
/// Checks if a `char` is a digit in the given radix.
///
/// A 'radix' here is sometimes also called a 'base'. A radix of two
/// indicates a binary number, a radix of ten, decimal, and a radix of
2016-01-05 16:40:40 +01:00
/// sixteen, hexadecimal, to give some common values. Arbitrary
2016-11-23 14:49:35 +01:00
/// radices are supported.
2015-03-11 05:13:16 +01:00
///
2020-10-13 00:18:41 +02:00
/// Compared to [`is_numeric()`], this function only recognizes the characters
2015-03-11 05:13:16 +01:00
/// `0-9`, `a-z` and `A-Z`.
///
/// 'Digit' is defined to be only the following characters:
2015-03-11 05:13:16 +01:00
///
/// * `0-9`
/// * `a-z`
/// * `A-Z`
///
2020-10-13 00:18:41 +02:00
/// For a more comprehensive understanding of 'digit', see [`is_numeric()`].
///
2020-11-19 14:55:57 +01:00
/// [`is_numeric()`]: #method.is_numeric
2015-03-11 05:13:16 +01:00
///
/// # Panics
///
/// Panics if given a radix larger than 36.
2015-03-11 05:13:16 +01:00
///
/// # Examples
///
/// Basic usage:
///
2015-03-11 05:13:16 +01:00
/// ```
2016-03-22 22:09:43 +01:00
/// assert!('1'.is_digit(10));
/// assert!('f'.is_digit(16));
/// assert!(!'f'.is_digit(10));
/// ```
///
/// Passing a large radix, causing a panic:
///
/// ```should_panic
/// // this panics
/// '1'.is_digit(37);
2015-03-11 05:13:16 +01:00
/// ```
#[stable(feature = "rust1", since = "1.0.0")]
#[inline]
2015-10-25 11:19:14 +01:00
pub fn is_digit(self, radix: u32) -> bool {
self.to_digit(radix).is_some()
2015-10-25 11:19:14 +01:00
}
2015-03-11 05:13:16 +01:00
/// Converts a `char` to a digit in the given radix.
///
/// A 'radix' here is sometimes also called a 'base'. A radix of two
/// indicates a binary number, a radix of ten, decimal, and a radix of
2016-01-05 16:40:40 +01:00
/// sixteen, hexadecimal, to give some common values. Arbitrary
2016-11-23 14:49:35 +01:00
/// radices are supported.
///
/// 'Digit' is defined to be only the following characters:
2015-03-11 05:13:16 +01:00
///
/// * `0-9`
/// * `a-z`
/// * `A-Z`
2015-03-11 05:13:16 +01:00
///
/// # Errors
///
/// Returns `None` if the `char` does not refer to a digit in the given radix.
2015-03-11 05:13:16 +01:00
///
/// # Panics
///
/// Panics if given a radix larger than 36.
2015-03-11 05:13:16 +01:00
///
/// # Examples
///
/// Basic usage:
///
/// ```
2016-03-22 22:09:43 +01:00
/// assert_eq!('1'.to_digit(10), Some(1));
/// assert_eq!('f'.to_digit(16), Some(15));
/// ```
///
/// Passing a non-digit results in failure:
///
2015-03-11 05:13:16 +01:00
/// ```
2016-03-22 22:09:43 +01:00
/// assert_eq!('f'.to_digit(10), None);
/// assert_eq!('z'.to_digit(16), None);
/// ```
///
/// Passing a large radix, causing a panic:
///
/// ```should_panic
/// // this panics
/// '1'.to_digit(37);
2015-03-11 05:13:16 +01:00
/// ```
#[stable(feature = "rust1", since = "1.0.0")]
#[inline]
2015-10-25 11:19:14 +01:00
pub fn to_digit(self, radix: u32) -> Option<u32> {
2021-02-14 12:34:22 +01:00
assert!(radix <= 36, "to_digit: radix is too high (maximum 36)");
2018-11-14 11:26:00 +01:00
// the code is split up here to improve execution speed for cases where
// the `radix` is constant and 10 or smaller
2021-02-14 12:34:22 +01:00
let val = if likely(radix <= 10) {
// If not a digit, a number greater than radix will be created.
(self as u32).wrapping_sub('0' as u32)
core/char: Speed up `to_digit()` for `radix <= 10` ### Before ``` # Run 1 test char::methods::bench_to_digit_radix_10 ... bench: 16,265 ns/iter (+/- 1,774) test char::methods::bench_to_digit_radix_16 ... bench: 13,938 ns/iter (+/- 2,479) test char::methods::bench_to_digit_radix_2 ... bench: 13,090 ns/iter (+/- 524) test char::methods::bench_to_digit_radix_36 ... bench: 14,236 ns/iter (+/- 1,949) # Run 2 test char::methods::bench_to_digit_radix_10 ... bench: 16,176 ns/iter (+/- 1,589) test char::methods::bench_to_digit_radix_16 ... bench: 13,896 ns/iter (+/- 3,140) test char::methods::bench_to_digit_radix_2 ... bench: 13,158 ns/iter (+/- 1,112) test char::methods::bench_to_digit_radix_36 ... bench: 14,206 ns/iter (+/- 1,312) # Run 3 test char::methods::bench_to_digit_radix_10 ... bench: 16,221 ns/iter (+/- 2,423) test char::methods::bench_to_digit_radix_16 ... bench: 14,361 ns/iter (+/- 3,926) test char::methods::bench_to_digit_radix_2 ... bench: 13,097 ns/iter (+/- 671) test char::methods::bench_to_digit_radix_36 ... bench: 14,388 ns/iter (+/- 1,068) ``` ### After ``` # Run 1 test char::methods::bench_to_digit_radix_10 ... bench: 11,521 ns/iter (+/- 552) test char::methods::bench_to_digit_radix_16 ... bench: 12,926 ns/iter (+/- 684) test char::methods::bench_to_digit_radix_2 ... bench: 11,266 ns/iter (+/- 1,085) test char::methods::bench_to_digit_radix_36 ... bench: 14,213 ns/iter (+/- 614) # Run 2 test char::methods::bench_to_digit_radix_10 ... bench: 11,424 ns/iter (+/- 1,042) test char::methods::bench_to_digit_radix_16 ... bench: 12,854 ns/iter (+/- 1,193) test char::methods::bench_to_digit_radix_2 ... bench: 11,193 ns/iter (+/- 716) test char::methods::bench_to_digit_radix_36 ... bench: 14,249 ns/iter (+/- 3,514) # Run 3 test char::methods::bench_to_digit_radix_10 ... bench: 11,469 ns/iter (+/- 685) test char::methods::bench_to_digit_radix_16 ... bench: 12,852 ns/iter (+/- 568) test char::methods::bench_to_digit_radix_2 ... 
bench: 11,275 ns/iter (+/- 1,356) test char::methods::bench_to_digit_radix_36 ... bench: 14,188 ns/iter (+/- 1,501) ```
2018-11-13 18:05:46 +01:00
} else {
match self {
'0'..='9' => self as u32 - '0' as u32,
'a'..='z' => self as u32 - 'a' as u32 + 10,
'A'..='Z' => self as u32 - 'A' as u32 + 10,
_ => return None,
}
};
core/char: Speed up `to_digit()` for `radix <= 10` ### Before ``` # Run 1 test char::methods::bench_to_digit_radix_10 ... bench: 16,265 ns/iter (+/- 1,774) test char::methods::bench_to_digit_radix_16 ... bench: 13,938 ns/iter (+/- 2,479) test char::methods::bench_to_digit_radix_2 ... bench: 13,090 ns/iter (+/- 524) test char::methods::bench_to_digit_radix_36 ... bench: 14,236 ns/iter (+/- 1,949) # Run 2 test char::methods::bench_to_digit_radix_10 ... bench: 16,176 ns/iter (+/- 1,589) test char::methods::bench_to_digit_radix_16 ... bench: 13,896 ns/iter (+/- 3,140) test char::methods::bench_to_digit_radix_2 ... bench: 13,158 ns/iter (+/- 1,112) test char::methods::bench_to_digit_radix_36 ... bench: 14,206 ns/iter (+/- 1,312) # Run 3 test char::methods::bench_to_digit_radix_10 ... bench: 16,221 ns/iter (+/- 2,423) test char::methods::bench_to_digit_radix_16 ... bench: 14,361 ns/iter (+/- 3,926) test char::methods::bench_to_digit_radix_2 ... bench: 13,097 ns/iter (+/- 671) test char::methods::bench_to_digit_radix_36 ... bench: 14,388 ns/iter (+/- 1,068) ``` ### After ``` # Run 1 test char::methods::bench_to_digit_radix_10 ... bench: 11,521 ns/iter (+/- 552) test char::methods::bench_to_digit_radix_16 ... bench: 12,926 ns/iter (+/- 684) test char::methods::bench_to_digit_radix_2 ... bench: 11,266 ns/iter (+/- 1,085) test char::methods::bench_to_digit_radix_36 ... bench: 14,213 ns/iter (+/- 614) # Run 2 test char::methods::bench_to_digit_radix_10 ... bench: 11,424 ns/iter (+/- 1,042) test char::methods::bench_to_digit_radix_16 ... bench: 12,854 ns/iter (+/- 1,193) test char::methods::bench_to_digit_radix_2 ... bench: 11,193 ns/iter (+/- 716) test char::methods::bench_to_digit_radix_36 ... bench: 14,249 ns/iter (+/- 3,514) # Run 3 test char::methods::bench_to_digit_radix_10 ... bench: 11,469 ns/iter (+/- 685) test char::methods::bench_to_digit_radix_16 ... bench: 12,852 ns/iter (+/- 568) test char::methods::bench_to_digit_radix_2 ... 
bench: 11,275 ns/iter (+/- 1,356) test char::methods::bench_to_digit_radix_36 ... bench: 14,188 ns/iter (+/- 1,501) ```
2018-11-13 18:05:46 +01:00
if val < radix { Some(val) } else { None }
2015-10-25 11:19:14 +01:00
}
2015-03-11 05:13:16 +01:00
/// Returns an iterator that yields the hexadecimal Unicode escape of a
/// character as `char`s.
2015-03-11 05:13:16 +01:00
///
/// This will escape characters with the Rust syntax of the form
/// `\u{NNNNNN}` where `NNNNNN` is a hexadecimal representation.
2015-03-11 05:13:16 +01:00
///
/// # Examples
///
/// As an iterator:
///
2015-03-11 05:13:16 +01:00
/// ```
/// for c in '❤'.escape_unicode() {
/// print!("{}", c);
2015-03-11 05:13:16 +01:00
/// }
/// println!();
2015-03-11 05:13:16 +01:00
/// ```
///
/// Using `println!` directly:
2015-03-11 05:13:16 +01:00
///
/// ```
/// println!("{}", '❤'.escape_unicode());
2015-03-11 05:13:16 +01:00
/// ```
///
/// Both are equivalent to:
2015-03-11 05:13:16 +01:00
///
/// ```
/// println!("\\u{{2764}}");
/// ```
2015-03-11 05:13:16 +01:00
///
/// Using `to_string`:
///
/// ```
/// assert_eq!('❤'.escape_unicode().to_string(), "\\u{2764}");
2015-03-11 05:13:16 +01:00
/// ```
#[stable(feature = "rust1", since = "1.0.0")]
#[inline]
2021-02-14 00:16:48 +01:00
pub fn escape_unicode(self) -> EscapeUnicode {
let c = self as u32;
// or-ing 1 ensures that for c==0 the code computes that one
// digit should be printed and (which is the same) avoids the
// (31 - 32) underflow
let msb = 31 - (c | 1).leading_zeros();
// the index of the most significant hex digit
let ms_hex_digit = msb / 4;
EscapeUnicode {
c: self,
state: EscapeUnicodeState::Backslash,
hex_digit_idx: ms_hex_digit as usize,
}
2015-10-25 11:19:14 +01:00
}
2015-03-11 05:13:16 +01:00
/// An extended version of `escape_debug` that optionally permits escaping
/// Extended Grapheme codepoints. This allows us to format characters like
/// nonspacing marks better when they're at the start of a string.
#[inline]
pub(crate) fn escape_debug_ext(self, escape_grapheme_extended: bool) -> EscapeDebug {
    let state = match self {
        // Characters that are written as a backslash followed by themselves.
        '\\' | '\'' | '"' => EscapeDefaultState::Backslash(self),
        // Characters with a dedicated single-letter escape.
        '\n' => EscapeDefaultState::Backslash('n'),
        '\r' => EscapeDefaultState::Backslash('r'),
        '\t' => EscapeDefaultState::Backslash('t'),
        // Grapheme-extending characters are escaped only on request; this
        // check must come before the printable check below.
        c if escape_grapheme_extended && c.is_grapheme_extended() => {
            EscapeDefaultState::Unicode(c.escape_unicode())
        }
        // Printable characters are emitted verbatim.
        c if is_printable(c) => EscapeDefaultState::Char(c),
        // Everything else gets a `\u{...}` escape.
        c => EscapeDefaultState::Unicode(c.escape_unicode()),
    };
    EscapeDebug(EscapeDefault { state })
}
/// Returns an iterator that yields the literal escape code of a character
/// as `char`s.
///
/// This will escape the characters similar to the `Debug` implementations
/// of `str` or `char`.
///
/// # Examples
///
/// As an iterator:
///
/// ```
/// for c in '\n'.escape_debug() {
/// print!("{}", c);
/// }
/// println!();
/// ```
///
/// Using `println!` directly:
///
/// ```
/// println!("{}", '\n'.escape_debug());
/// ```
///
/// Both are equivalent to:
///
/// ```
/// println!("\\n");
/// ```
///
/// Using `to_string`:
///
/// ```
/// assert_eq!('\n'.escape_debug().to_string(), "\\n");
/// ```
#[stable(feature = "char_escape_debug", since = "1.20.0")]
#[inline]
pub fn escape_debug(self) -> EscapeDebug {
// The public entry point always asks `escape_debug_ext` to escape
// grapheme-extending characters (the `true` argument).
self.escape_debug_ext(true)
}
/// Returns an iterator that yields the literal escape code of a character
/// as `char`s.
2015-03-11 05:13:16 +01:00
///
/// The default is chosen with a bias toward producing literals that are
/// legal in a variety of languages, including C++11 and similar C-family
/// languages. The exact rules are:
///
/// * Tab is escaped as `\t`.
/// * Carriage return is escaped as `\r`.
/// * Line feed is escaped as `\n`.
/// * Single quote is escaped as `\'`.
/// * Double quote is escaped as `\"`.
/// * Backslash is escaped as `\\`.
/// * Any character in the 'printable ASCII' range `0x20` .. `0x7e`
/// inclusive is not escaped.
/// * All other characters are given hexadecimal Unicode escapes; see
2020-10-13 00:18:41 +02:00
/// [`escape_unicode`].
///
2020-11-19 14:55:57 +01:00
/// [`escape_unicode`]: #method.escape_unicode
2015-03-11 05:13:16 +01:00
///
/// # Examples
///
/// As an iterator:
///
2015-03-11 05:13:16 +01:00
/// ```
/// for c in '"'.escape_default() {
/// print!("{}", c);
2015-03-11 05:13:16 +01:00
/// }
/// println!();
2015-03-11 05:13:16 +01:00
/// ```
///
/// Using `println!` directly:
2015-03-11 05:13:16 +01:00
///
/// ```
/// println!("{}", '"'.escape_default());
/// ```
///
/// Both are equivalent to:
2015-03-11 05:13:16 +01:00
///
/// ```
/// println!("\\\"");
/// ```
///
/// Using `to_string`:
2015-03-11 05:13:16 +01:00
///
/// ```
/// assert_eq!('"'.escape_default().to_string(), "\\\"");
2015-03-11 05:13:16 +01:00
/// ```
#[stable(feature = "rust1", since = "1.0.0")]
#[inline]
2021-02-14 00:16:48 +01:00
pub fn escape_default(self) -> EscapeDefault {
let init_state = match self {
'\t' => EscapeDefaultState::Backslash('t'),
'\r' => EscapeDefaultState::Backslash('r'),
'\n' => EscapeDefaultState::Backslash('n'),
'\\' | '\'' | '"' => EscapeDefaultState::Backslash(self),
'\x20'..='\x7e' => EscapeDefaultState::Char(self),
_ => EscapeDefaultState::Unicode(self.escape_unicode()),
};
EscapeDefault { state: init_state }
2015-10-25 11:19:14 +01:00
}
2015-03-11 05:13:16 +01:00
/// Returns the number of bytes this `char` would need if encoded in UTF-8.
///
/// That number of bytes is always between 1 and 4, inclusive.
2015-03-11 05:13:16 +01:00
///
/// # Examples
///
/// Basic usage:
///
2015-03-11 05:13:16 +01:00
/// ```
/// let len = 'A'.len_utf8();
/// assert_eq!(len, 1);
///
/// let len = 'ß'.len_utf8();
/// assert_eq!(len, 2);
2015-03-11 05:13:16 +01:00
///
/// let len = 'ℝ'.len_utf8();
/// assert_eq!(len, 3);
///
/// let len = '💣'.len_utf8();
/// assert_eq!(len, 4);
/// ```
///
/// The `&str` type guarantees that its contents are UTF-8, and so we can compare the length it
/// would take if each code point was represented as a `char` vs in the `&str` itself:
///
/// ```
/// // as chars
/// let eastern = '東';
/// let capital = '京';
///
/// // both can be represented as three bytes
/// assert_eq!(3, eastern.len_utf8());
2019-06-29 19:46:45 +02:00
/// assert_eq!(3, capital.len_utf8());
///
/// // as a &str, these two are encoded in UTF-8
/// let tokyo = "東京";
///
2019-06-29 19:46:45 +02:00
/// let len = eastern.len_utf8() + capital.len_utf8();
///
/// // we can see that they take six bytes total...
/// assert_eq!(6, tokyo.len());
///
/// // ... just like the &str
/// assert_eq!(len, tokyo.len());
2015-03-11 05:13:16 +01:00
/// ```
#[stable(feature = "rust1", since = "1.0.0")]
#[rustc_const_stable(feature = "const_char_len_utf", since = "1.52.0")]
#[inline]
pub const fn len_utf8(self) -> usize {
    // Resolves to the free function `len_utf8` (not this method), which
    // works on the raw `u32` scalar value.
    len_utf8(self as u32)
}
2015-03-11 05:13:16 +01:00
/// Returns the number of 16-bit code units this `char` would need if
2015-03-11 05:13:16 +01:00
/// encoded in UTF-16.
///
2020-10-13 00:18:41 +02:00
/// See the documentation for [`len_utf8()`] for more explanation of this
2015-12-09 21:58:49 +01:00
/// concept. This function is a mirror, but for UTF-16 instead of UTF-8.
///
2020-11-19 14:55:57 +01:00
/// [`len_utf8()`]: #method.len_utf8
///
2015-03-11 05:13:16 +01:00
/// # Examples
///
/// Basic usage:
///
2015-03-11 05:13:16 +01:00
/// ```
/// let n = 'ß'.len_utf16();
/// assert_eq!(n, 1);
///
/// let len = '💣'.len_utf16();
/// assert_eq!(len, 2);
2015-03-11 05:13:16 +01:00
/// ```
#[stable(feature = "rust1", since = "1.0.0")]
#[rustc_const_stable(feature = "const_char_len_utf", since = "1.52.0")]
#[inline]
pub const fn len_utf16(self) -> usize {
let ch = self as u32;
if (ch & 0xFFFF) == ch { 1 } else { 2 }
2015-10-25 11:19:14 +01:00
}
2015-03-11 05:13:16 +01:00
/// Encodes this character as UTF-8 into the provided byte buffer,
/// and then returns the subslice of the buffer that contains the encoded character.
2015-03-11 05:13:16 +01:00
///
/// # Panics
///
/// Panics if the buffer is not large enough.
/// A buffer of length four is large enough to encode any `char`.
2015-03-11 05:13:16 +01:00
///
/// # Examples
///
/// In both of these examples, 'ß' takes two bytes to encode.
///
2015-03-11 05:13:16 +01:00
/// ```
/// let mut b = [0; 2];
2015-03-11 05:13:16 +01:00
///
/// let result = 'ß'.encode_utf8(&mut b);
///
/// assert_eq!(result, "ß");
///
/// assert_eq!(result.len(), 2);
/// ```
///
/// A buffer that's too small:
///
/// ```should_panic
/// let mut b = [0; 1];
///
/// // this panics
/// 'ß'.encode_utf8(&mut b);
2015-03-11 05:13:16 +01:00
/// ```
#[stable(feature = "unicode_encode_char", since = "1.15.0")]
#[inline]
pub fn encode_utf8(self, dst: &mut [u8]) -> &mut str {
    // Write the bytes first; the raw encoder hands back the part of `dst`
    // that it filled.
    let encoded = encode_utf8_raw(self as u32, dst);
    // SAFETY: `char` is not a surrogate, so this is valid UTF-8.
    unsafe { from_utf8_unchecked_mut(encoded) }
}
2015-03-11 05:13:16 +01:00
/// Encodes this character as UTF-16 into the provided `u16` buffer,
/// and then returns the subslice of the buffer that contains the encoded character.
2015-03-11 05:13:16 +01:00
///
/// # Panics
///
/// Panics if the buffer is not large enough.
/// A buffer of length 2 is large enough to encode any `char`.
2015-03-11 05:13:16 +01:00
///
/// # Examples
///
/// In both of these examples, '𝕊' takes two `u16`s to encode.
///
2015-03-11 05:13:16 +01:00
/// ```
/// let mut b = [0; 2];
2015-03-11 05:13:16 +01:00
///
/// let result = '𝕊'.encode_utf16(&mut b);
///
/// assert_eq!(result.len(), 2);
/// ```
///
/// A buffer that's too small:
///
/// ```should_panic
/// let mut b = [0; 1];
///
/// // this panics
/// '𝕊'.encode_utf16(&mut b);
2015-03-11 05:13:16 +01:00
/// ```
#[stable(feature = "unicode_encode_char", since = "1.15.0")]
#[inline]
pub fn encode_utf16(self, dst: &mut [u16]) -> &mut [u16] {
// Thin forwarding wrapper: the scalar value and `dst` are handed to
// `encode_utf16_raw`, and the slice it returns is passed back unchanged.
encode_utf16_raw(self as u32, dst)
}
2015-03-11 05:13:16 +01:00
/// Returns `true` if this `char` has the `Alphabetic` property.
///
/// `Alphabetic` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and
/// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`].
///
/// [Unicode Standard]: https://www.unicode.org/versions/latest/
/// [ucd]: https://www.unicode.org/reports/tr44/
/// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
///
/// # Examples
///
/// Basic usage:
///
/// ```
2016-03-22 22:09:43 +01:00
/// assert!('a'.is_alphabetic());
/// assert!('京'.is_alphabetic());
///
/// let c = '💝';
/// // love is many things, but it is not alphabetic
/// assert!(!c.is_alphabetic());
/// ```
2015-03-11 05:13:16 +01:00
#[stable(feature = "rust1", since = "1.0.0")]
#[inline]
2015-03-11 05:13:16 +01:00
pub fn is_alphabetic(self) -> bool {
match self {
'a'..='z' | 'A'..='Z' => true,
c => c > '\x7f' && unicode::Alphabetic(c),
2015-03-11 05:13:16 +01:00
}
}
/// Returns `true` if this `char` has the `Lowercase` property.
2015-03-11 05:13:16 +01:00
///
/// `Lowercase` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and
/// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`].
///
/// [Unicode Standard]: https://www.unicode.org/versions/latest/
/// [ucd]: https://www.unicode.org/reports/tr44/
/// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
///
/// # Examples
///
/// Basic usage:
///
/// ```
2016-03-22 22:09:43 +01:00
/// assert!('a'.is_lowercase());
/// assert!('δ'.is_lowercase());
/// assert!(!'A'.is_lowercase());
/// assert!(!'Δ'.is_lowercase());
///
/// // The various Chinese scripts and punctuation do not have case, and so:
2016-03-22 22:09:43 +01:00
/// assert!(!'中'.is_lowercase());
/// assert!(!' '.is_lowercase());
/// ```
2015-03-11 05:13:16 +01:00
#[stable(feature = "rust1", since = "1.0.0")]
#[inline]
2015-03-11 05:13:16 +01:00
pub fn is_lowercase(self) -> bool {
match self {
'a'..='z' => true,
c => c > '\x7f' && unicode::Lowercase(c),
2015-03-11 05:13:16 +01:00
}
}
/// Returns `true` if this `char` has the `Uppercase` property.
///
/// `Uppercase` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and
/// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`].
2015-03-11 05:13:16 +01:00
///
/// [Unicode Standard]: https://www.unicode.org/versions/latest/
/// [ucd]: https://www.unicode.org/reports/tr44/
/// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
///
/// # Examples
///
/// Basic usage:
///
/// ```
2016-03-22 22:09:43 +01:00
/// assert!(!'a'.is_uppercase());
/// assert!(!'δ'.is_uppercase());
/// assert!('A'.is_uppercase());
/// assert!('Δ'.is_uppercase());
///
/// // The various Chinese scripts and punctuation do not have case, and so:
2016-03-22 22:09:43 +01:00
/// assert!(!'中'.is_uppercase());
/// assert!(!' '.is_uppercase());
/// ```
2015-03-11 05:13:16 +01:00
#[stable(feature = "rust1", since = "1.0.0")]
#[inline]
2015-03-11 05:13:16 +01:00
pub fn is_uppercase(self) -> bool {
match self {
'A'..='Z' => true,
c => c > '\x7f' && unicode::Uppercase(c),
2015-03-11 05:13:16 +01:00
}
}
/// Returns `true` if this `char` has the `White_Space` property.
2015-03-11 05:13:16 +01:00
///
/// `White_Space` is specified in the [Unicode Character Database][ucd] [`PropList.txt`].
///
/// [ucd]: https://www.unicode.org/reports/tr44/
/// [`PropList.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt
///
/// # Examples
///
/// Basic usage:
///
/// ```
2016-03-22 22:09:43 +01:00
/// assert!(' '.is_whitespace());
///
/// // a non-breaking space
2016-03-22 22:09:43 +01:00
/// assert!('\u{A0}'.is_whitespace());
///
2016-03-22 22:09:43 +01:00
/// assert!(!'越'.is_whitespace());
/// ```
2015-03-11 05:13:16 +01:00
#[stable(feature = "rust1", since = "1.0.0")]
#[inline]
2015-03-11 05:13:16 +01:00
pub fn is_whitespace(self) -> bool {
match self {
' ' | '\x09'..='\x0d' => true,
c => c > '\x7f' && unicode::White_Space(c),
2015-03-11 05:13:16 +01:00
}
}
/// Returns `true` if this `char` satisfies either [`is_alphabetic()`] or [`is_numeric()`].
2015-03-11 05:13:16 +01:00
///
2020-11-19 14:55:57 +01:00
/// [`is_alphabetic()`]: #method.is_alphabetic
/// [`is_numeric()`]: #method.is_numeric
///
/// # Examples
///
/// Basic usage:
///
/// ```
2016-03-22 22:09:43 +01:00
/// assert!('٣'.is_alphanumeric());
/// assert!('7'.is_alphanumeric());
/// assert!('৬'.is_alphanumeric());
2018-06-17 18:58:01 +02:00
/// assert!('¾'.is_alphanumeric());
/// assert!('①'.is_alphanumeric());
2016-03-22 22:09:43 +01:00
/// assert!('K'.is_alphanumeric());
/// assert!('و'.is_alphanumeric());
/// assert!('藏'.is_alphanumeric());
/// ```
2015-03-11 05:13:16 +01:00
#[stable(feature = "rust1", since = "1.0.0")]
#[inline]
2015-03-11 05:13:16 +01:00
pub fn is_alphanumeric(self) -> bool {
self.is_alphabetic() || self.is_numeric()
}
/// Returns `true` if this `char` has the general category for control codes.
///
/// Control codes (code points with the general category of `Cc`) are described in Chapter 4
/// (Character Properties) of the [Unicode Standard] and specified in the [Unicode Character
/// Database][ucd] [`UnicodeData.txt`].
2015-03-11 05:13:16 +01:00
///
/// [Unicode Standard]: https://www.unicode.org/versions/latest/
/// [ucd]: https://www.unicode.org/reports/tr44/
/// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// // U+009C, STRING TERMINATOR
2016-03-22 22:09:43 +01:00
/// assert!('\u{009c}'.is_control());
/// assert!(!'q'.is_control());
/// ```
2015-03-11 05:13:16 +01:00
#[stable(feature = "rust1", since = "1.0.0")]
#[inline]
2015-10-25 11:19:14 +01:00
pub fn is_control(self) -> bool {
unicode::Cc(self)
2015-10-25 11:19:14 +01:00
}
2015-03-11 05:13:16 +01:00
/// Returns `true` if this `char` has the `Grapheme_Extend` property.
///
/// `Grapheme_Extend` is described in [Unicode Standard Annex #29 (Unicode Text
/// Segmentation)][uax29] and specified in the [Unicode Character Database][ucd]
/// [`DerivedCoreProperties.txt`].
///
/// [uax29]: https://www.unicode.org/reports/tr29/
/// [ucd]: https://www.unicode.org/reports/tr44/
/// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
#[inline]
2018-05-17 00:19:58 +02:00
pub(crate) fn is_grapheme_extended(self) -> bool {
// Crate-visible helper: the result is exactly what `unicode::Grapheme_Extend` returns.
unicode::Grapheme_Extend(self)
}
/// Returns `true` if this `char` has one of the general categories for numbers.
///
/// The general categories for numbers (`Nd` for decimal digits, `Nl` for letter-like numeric
/// characters, and `No` for other numeric characters) are specified in the [Unicode Character
/// Database][ucd] [`UnicodeData.txt`].
///
/// [Unicode Standard]: https://www.unicode.org/versions/latest/
/// [ucd]: https://www.unicode.org/reports/tr44/
/// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
///
/// # Examples
///
/// Basic usage:
///
/// ```
2016-03-22 22:09:43 +01:00
/// assert!('٣'.is_numeric());
/// assert!('7'.is_numeric());
/// assert!('৬'.is_numeric());
2018-06-17 18:58:01 +02:00
/// assert!('¾'.is_numeric());
/// assert!('①'.is_numeric());
2016-03-22 22:09:43 +01:00
/// assert!(!'K'.is_numeric());
/// assert!(!'و'.is_numeric());
/// assert!(!'藏'.is_numeric());
/// ```
2015-03-11 05:13:16 +01:00
#[stable(feature = "rust1", since = "1.0.0")]
#[inline]
2015-03-11 05:13:16 +01:00
pub fn is_numeric(self) -> bool {
    match self {
        // ASCII digits are answered directly.
        '0'..='9' => true,
        // No other ASCII value is numeric; only code points above U+007F are
        // passed to `unicode::N`.
        c => c > '\x7f' && unicode::N(c),
    }
}
/// Returns an iterator that yields the lowercase mapping of this `char` as one or more
/// `char`s.
2015-03-11 05:13:16 +01:00
///
/// If this `char` does not have a lowercase mapping, the iterator yields the same `char`.
2015-03-11 05:13:16 +01:00
///
/// If this `char` has a one-to-one lowercase mapping given by the [Unicode Character
/// Database][ucd] [`UnicodeData.txt`], the iterator yields that `char`.
2015-03-11 05:13:16 +01:00
///
/// [ucd]: https://www.unicode.org/reports/tr44/
/// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
///
/// If this `char` requires special considerations (e.g. multiple `char`s) the iterator yields
/// the `char`(s) given by [`SpecialCasing.txt`].
///
/// [`SpecialCasing.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/SpecialCasing.txt
///
/// This operation performs an unconditional mapping without tailoring. That is, the conversion
/// is independent of context and language.
///
/// In the [Unicode Standard], Chapter 4 (Character Properties) discusses case mapping in
/// general and Chapter 3 (Conformance) discusses the default algorithm for case conversion.
///
/// [Unicode Standard]: https://www.unicode.org/versions/latest/
///
/// # Examples
///
/// As an iterator:
///
/// ```
/// for c in 'İ'.to_lowercase() {
/// print!("{}", c);
/// }
/// println!();
/// ```
///
/// Using `println!` directly:
///
/// ```
/// println!("{}", 'İ'.to_lowercase());
/// ```
///
/// Both are equivalent to:
///
/// ```
/// println!("i\u{307}");
/// ```
///
/// Using `to_string`:
///
/// ```
/// assert_eq!('C'.to_lowercase().to_string(), "c");
///
/// // Sometimes the result is more than one character:
/// assert_eq!('İ'.to_lowercase().to_string(), "i\u{307}");
///
/// // Characters that do not have both uppercase and lowercase
/// // convert into themselves.
/// assert_eq!('山'.to_lowercase().to_string(), "山");
/// ```
2015-03-11 05:13:16 +01:00
#[stable(feature = "rust1", since = "1.0.0")]
#[inline]
2015-03-11 05:13:16 +01:00
pub fn to_lowercase(self) -> ToLowercase {
ToLowercase(CaseMappingIter::new(conversions::to_lower(self)))
2015-03-11 05:13:16 +01:00
}
/// Returns an iterator that yields the uppercase mapping of this `char` as one or more
/// `char`s.
///
/// If this `char` does not have an uppercase mapping, the iterator yields the same `char`.
///
/// If this `char` has a one-to-one uppercase mapping given by the [Unicode Character
/// Database][ucd] [`UnicodeData.txt`], the iterator yields that `char`.
2015-03-11 05:13:16 +01:00
///
/// [ucd]: https://www.unicode.org/reports/tr44/
/// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
2015-03-11 05:13:16 +01:00
///
/// If this `char` requires special considerations (e.g. multiple `char`s) the iterator yields
/// the `char`(s) given by [`SpecialCasing.txt`].
2015-03-11 05:13:16 +01:00
///
/// [`SpecialCasing.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/SpecialCasing.txt
2015-03-11 05:13:16 +01:00
///
/// This operation performs an unconditional mapping without tailoring. That is, the conversion
/// is independent of context and language.
2015-03-11 05:13:16 +01:00
///
/// In the [Unicode Standard], Chapter 4 (Character Properties) discusses case mapping in
/// general and Chapter 3 (Conformance) discusses the default algorithm for case conversion.
2015-03-11 05:13:16 +01:00
///
/// [Unicode Standard]: https://www.unicode.org/versions/latest/
///
/// # Examples
///
/// As an iterator:
///
/// ```
/// for c in 'ß'.to_uppercase() {
/// print!("{}", c);
/// }
/// println!();
/// ```
///
/// Using `println!` directly:
///
/// ```
/// println!("{}", 'ß'.to_uppercase());
/// ```
///
/// Both are equivalent to:
///
/// ```
/// println!("SS");
/// ```
///
/// Using `to_string`:
///
/// ```
/// assert_eq!('c'.to_uppercase().to_string(), "C");
///
/// // Sometimes the result is more than one character:
/// assert_eq!('ß'.to_uppercase().to_string(), "SS");
///
/// // Characters that do not have both uppercase and lowercase
/// // convert into themselves.
/// assert_eq!('山'.to_uppercase().to_string(), "山");
/// ```
///
/// # Note on locale
///
/// In Turkish, the equivalent of 'i' in Latin has five forms instead of two:
///
/// * 'Dotless': I / ı, sometimes written ï
/// * 'Dotted': İ / i
///
/// Note that the lowercase dotted 'i' is the same as the Latin. Therefore:
///
/// ```
/// let upper_i = 'i'.to_uppercase().to_string();
/// ```
///
/// The value of `upper_i` here relies on the language of the text: if we're
/// in `en-US`, it should be `"I"`, but if we're in `tr_TR`, it should
/// be `"İ"`. `to_uppercase()` does not take this into account, and so:
///
/// ```
/// let upper_i = 'i'.to_uppercase().to_string();
///
/// assert_eq!(upper_i, "I");
/// ```
///
/// holds across languages.
2015-03-11 05:13:16 +01:00
#[stable(feature = "rust1", since = "1.0.0")]
#[inline]
2015-03-11 05:13:16 +01:00
pub fn to_uppercase(self) -> ToUppercase {
ToUppercase(CaseMappingIter::new(conversions::to_upper(self)))
2015-03-11 05:13:16 +01:00
}
/// Checks if the value is within the ASCII range.
///
/// # Examples
///
/// ```
/// let ascii = 'a';
/// let non_ascii = '❤';
///
/// assert!(ascii.is_ascii());
/// assert!(!non_ascii.is_ascii());
/// ```
#[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
2019-12-18 18:00:59 +01:00
#[rustc_const_stable(feature = "const_ascii_methods_on_intrinsics", since = "1.32.0")]
#[inline]
2018-10-23 02:04:14 +02:00
pub const fn is_ascii(&self) -> bool {
    // ASCII is the range U+0000 through U+007F, i.e. every scalar value below 0x80.
    let code_point = *self as u32;
    code_point < 0x80
}
/// Makes a copy of the value in its ASCII upper case equivalent.
///
/// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
/// but non-ASCII letters are unchanged.
///
2020-10-13 00:18:41 +02:00
/// To uppercase the value in-place, use [`make_ascii_uppercase()`].
///
/// To uppercase ASCII characters in addition to non-ASCII characters, use
2020-10-13 00:18:41 +02:00
/// [`to_uppercase()`].
///
/// # Examples
///
/// ```
/// let ascii = 'a';
/// let non_ascii = '❤';
///
/// assert_eq!('A', ascii.to_ascii_uppercase());
/// assert_eq!('❤', non_ascii.to_ascii_uppercase());
/// ```
///
2020-11-19 14:55:57 +01:00
/// [`make_ascii_uppercase()`]: #method.make_ascii_uppercase
/// [`to_uppercase()`]: #method.to_uppercase
#[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
#[rustc_const_stable(feature = "const_ascii_methods_on_intrinsics", since = "1.52.0")]
#[inline]
pub const fn to_ascii_uppercase(&self) -> char {
2021-02-12 14:42:42 +01:00
if self.is_ascii_lowercase() {
(*self as u8).ascii_change_case_unchecked() as char
} else {
*self
}
}
/// Makes a copy of the value in its ASCII lower case equivalent.
///
/// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
/// but non-ASCII letters are unchanged.
///
2020-10-13 00:18:41 +02:00
/// To lowercase the value in-place, use [`make_ascii_lowercase()`].
///
/// To lowercase ASCII characters in addition to non-ASCII characters, use
2020-10-13 00:18:41 +02:00
/// [`to_lowercase()`].
///
/// # Examples
///
/// ```
/// let ascii = 'A';
/// let non_ascii = '❤';
///
/// assert_eq!('a', ascii.to_ascii_lowercase());
/// assert_eq!('❤', non_ascii.to_ascii_lowercase());
/// ```
///
2020-11-19 14:55:57 +01:00
/// [`make_ascii_lowercase()`]: #method.make_ascii_lowercase
/// [`to_lowercase()`]: #method.to_lowercase
#[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
#[rustc_const_stable(feature = "const_ascii_methods_on_intrinsics", since = "1.52.0")]
#[inline]
pub const fn to_ascii_lowercase(&self) -> char {
2021-02-12 14:42:42 +01:00
if self.is_ascii_uppercase() {
(*self as u8).ascii_change_case_unchecked() as char
} else {
*self
}
}
/// Checks that two values are an ASCII case-insensitive match.
///
/// Equivalent to `to_ascii_lowercase(a) == to_ascii_lowercase(b)`.
///
/// # Examples
///
/// ```
/// let upper_a = 'A';
/// let lower_a = 'a';
/// let lower_z = 'z';
///
/// assert!(upper_a.eq_ignore_ascii_case(&lower_a));
/// assert!(upper_a.eq_ignore_ascii_case(&upper_a));
/// assert!(!upper_a.eq_ignore_ascii_case(&lower_z));
/// ```
#[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
#[rustc_const_stable(feature = "const_ascii_methods_on_intrinsics", since = "1.52.0")]
#[inline]
pub const fn eq_ignore_ascii_case(&self, other: &char) -> bool {
    // Fold both operands to ASCII lowercase, then compare the folded values.
    let lhs = self.to_ascii_lowercase();
    let rhs = other.to_ascii_lowercase();
    lhs == rhs
}
/// Converts this type to its ASCII upper case equivalent in-place.
///
/// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
/// but non-ASCII letters are unchanged.
///
/// To return a new uppercased value without modifying the existing one, use
2020-10-13 00:18:41 +02:00
/// [`to_ascii_uppercase()`].
///
/// # Examples
///
/// ```
/// let mut ascii = 'a';
///
/// ascii.make_ascii_uppercase();
///
/// assert_eq!('A', ascii);
/// ```
///
2020-11-19 14:55:57 +01:00
/// [`to_ascii_uppercase()`]: #method.to_ascii_uppercase
#[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
#[inline]
pub fn make_ascii_uppercase(&mut self) {
    // Compute the uppercased copy, then store it back through the reference.
    let upper = self.to_ascii_uppercase();
    *self = upper;
}
/// Converts this type to its ASCII lower case equivalent in-place.
///
/// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
/// but non-ASCII letters are unchanged.
///
/// To return a new lowercased value without modifying the existing one, use
2020-10-13 00:18:41 +02:00
/// [`to_ascii_lowercase()`].
///
/// # Examples
///
/// ```
/// let mut ascii = 'A';
///
/// ascii.make_ascii_lowercase();
///
/// assert_eq!('a', ascii);
/// ```
///
2020-11-19 14:55:57 +01:00
/// [`to_ascii_lowercase()`]: #method.to_ascii_lowercase
#[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
#[inline]
pub fn make_ascii_lowercase(&mut self) {
    // Compute the lowercased copy, then store it back through the reference.
    let lower = self.to_ascii_lowercase();
    *self = lower;
}
/// Checks if the value is an ASCII alphabetic character:
///
2019-06-14 19:24:38 +02:00
/// - U+0041 'A' ..= U+005A 'Z', or
/// - U+0061 'a' ..= U+007A 'z'.
///
/// # Examples
///
/// ```
/// let uppercase_a = 'A';
/// let uppercase_g = 'G';
/// let a = 'a';
/// let g = 'g';
/// let zero = '0';
/// let percent = '%';
/// let space = ' ';
/// let lf = '\n';
/// let esc: char = 0x1b_u8.into();
///
/// assert!(uppercase_a.is_ascii_alphabetic());
/// assert!(uppercase_g.is_ascii_alphabetic());
/// assert!(a.is_ascii_alphabetic());
/// assert!(g.is_ascii_alphabetic());
/// assert!(!zero.is_ascii_alphabetic());
/// assert!(!percent.is_ascii_alphabetic());
/// assert!(!space.is_ascii_alphabetic());
/// assert!(!lf.is_ascii_alphabetic());
/// assert!(!esc.is_ascii_alphabetic());
/// ```
#[stable(feature = "ascii_ctype_on_intrinsics", since = "1.24.0")]
#[rustc_const_stable(feature = "const_ascii_ctype_on_intrinsics", since = "1.47.0")]
#[inline]
pub const fn is_ascii_alphabetic(&self) -> bool {
2020-10-05 16:29:07 +02:00
matches!(*self, 'A'..='Z' | 'a'..='z')
}
/// Checks if the value is an ASCII uppercase character:
2019-06-14 19:24:38 +02:00
/// U+0041 'A' ..= U+005A 'Z'.
///
/// # Examples
///
/// ```
/// let uppercase_a = 'A';
/// let uppercase_g = 'G';
/// let a = 'a';
/// let g = 'g';
/// let zero = '0';
/// let percent = '%';
/// let space = ' ';
/// let lf = '\n';
/// let esc: char = 0x1b_u8.into();
///
/// assert!(uppercase_a.is_ascii_uppercase());
/// assert!(uppercase_g.is_ascii_uppercase());
/// assert!(!a.is_ascii_uppercase());
/// assert!(!g.is_ascii_uppercase());
/// assert!(!zero.is_ascii_uppercase());
/// assert!(!percent.is_ascii_uppercase());
/// assert!(!space.is_ascii_uppercase());
/// assert!(!lf.is_ascii_uppercase());
/// assert!(!esc.is_ascii_uppercase());
/// ```
#[stable(feature = "ascii_ctype_on_intrinsics", since = "1.24.0")]
#[rustc_const_stable(feature = "const_ascii_ctype_on_intrinsics", since = "1.47.0")]
#[inline]
pub const fn is_ascii_uppercase(&self) -> bool {
2020-10-05 16:29:07 +02:00
matches!(*self, 'A'..='Z')
}
/// Checks if the value is an ASCII lowercase character:
2019-06-14 19:24:38 +02:00
/// U+0061 'a' ..= U+007A 'z'.
///
/// # Examples
///
/// ```
/// let uppercase_a = 'A';
/// let uppercase_g = 'G';
/// let a = 'a';
/// let g = 'g';
/// let zero = '0';
/// let percent = '%';
/// let space = ' ';
/// let lf = '\n';
/// let esc: char = 0x1b_u8.into();
///
/// assert!(!uppercase_a.is_ascii_lowercase());
/// assert!(!uppercase_g.is_ascii_lowercase());
/// assert!(a.is_ascii_lowercase());
/// assert!(g.is_ascii_lowercase());
/// assert!(!zero.is_ascii_lowercase());
/// assert!(!percent.is_ascii_lowercase());
/// assert!(!space.is_ascii_lowercase());
/// assert!(!lf.is_ascii_lowercase());
/// assert!(!esc.is_ascii_lowercase());
/// ```
#[stable(feature = "ascii_ctype_on_intrinsics", since = "1.24.0")]
#[rustc_const_stable(feature = "const_ascii_ctype_on_intrinsics", since = "1.47.0")]
#[inline]
pub const fn is_ascii_lowercase(&self) -> bool {
2020-10-05 16:29:07 +02:00
matches!(*self, 'a'..='z')
}
/// Checks if the value is an ASCII alphanumeric character:
///
2019-06-14 19:24:38 +02:00
/// - U+0041 'A' ..= U+005A 'Z', or
/// - U+0061 'a' ..= U+007A 'z', or
/// - U+0030 '0' ..= U+0039 '9'.
///
/// # Examples
///
/// ```
/// let uppercase_a = 'A';
/// let uppercase_g = 'G';
/// let a = 'a';
/// let g = 'g';
/// let zero = '0';
/// let percent = '%';
/// let space = ' ';
/// let lf = '\n';
/// let esc: char = 0x1b_u8.into();
///
/// assert!(uppercase_a.is_ascii_alphanumeric());
/// assert!(uppercase_g.is_ascii_alphanumeric());
/// assert!(a.is_ascii_alphanumeric());
/// assert!(g.is_ascii_alphanumeric());
/// assert!(zero.is_ascii_alphanumeric());
/// assert!(!percent.is_ascii_alphanumeric());
/// assert!(!space.is_ascii_alphanumeric());
/// assert!(!lf.is_ascii_alphanumeric());
/// assert!(!esc.is_ascii_alphanumeric());
/// ```
#[stable(feature = "ascii_ctype_on_intrinsics", since = "1.24.0")]
#[rustc_const_stable(feature = "const_ascii_ctype_on_intrinsics", since = "1.47.0")]
#[inline]
pub const fn is_ascii_alphanumeric(&self) -> bool {
2020-10-05 16:29:07 +02:00
matches!(*self, '0'..='9' | 'A'..='Z' | 'a'..='z')
}
/// Checks if the value is an ASCII decimal digit:
2019-06-14 19:24:38 +02:00
/// U+0030 '0' ..= U+0039 '9'.
///
/// # Examples
///
/// ```
/// let uppercase_a = 'A';
/// let uppercase_g = 'G';
/// let a = 'a';
/// let g = 'g';
/// let zero = '0';
/// let percent = '%';
/// let space = ' ';
/// let lf = '\n';
/// let esc: char = 0x1b_u8.into();
///
/// assert!(!uppercase_a.is_ascii_digit());
/// assert!(!uppercase_g.is_ascii_digit());
/// assert!(!a.is_ascii_digit());
/// assert!(!g.is_ascii_digit());
/// assert!(zero.is_ascii_digit());
/// assert!(!percent.is_ascii_digit());
/// assert!(!space.is_ascii_digit());
/// assert!(!lf.is_ascii_digit());
/// assert!(!esc.is_ascii_digit());
/// ```
#[stable(feature = "ascii_ctype_on_intrinsics", since = "1.24.0")]
#[rustc_const_stable(feature = "const_ascii_ctype_on_intrinsics", since = "1.47.0")]
#[inline]
pub const fn is_ascii_digit(&self) -> bool {
2020-10-05 16:29:07 +02:00
matches!(*self, '0'..='9')
}
/// Checks if the value is an ASCII hexadecimal digit:
///
2019-06-14 19:24:38 +02:00
/// - U+0030 '0' ..= U+0039 '9', or
/// - U+0041 'A' ..= U+0046 'F', or
/// - U+0061 'a' ..= U+0066 'f'.
///
/// # Examples
///
/// ```
/// let uppercase_a = 'A';
/// let uppercase_g = 'G';
/// let a = 'a';
/// let g = 'g';
/// let zero = '0';
/// let percent = '%';
/// let space = ' ';
/// let lf = '\n';
/// let esc: char = 0x1b_u8.into();
///
/// assert!(uppercase_a.is_ascii_hexdigit());
/// assert!(!uppercase_g.is_ascii_hexdigit());
/// assert!(a.is_ascii_hexdigit());
/// assert!(!g.is_ascii_hexdigit());
/// assert!(zero.is_ascii_hexdigit());
/// assert!(!percent.is_ascii_hexdigit());
/// assert!(!space.is_ascii_hexdigit());
/// assert!(!lf.is_ascii_hexdigit());
/// assert!(!esc.is_ascii_hexdigit());
/// ```
#[stable(feature = "ascii_ctype_on_intrinsics", since = "1.24.0")]
#[rustc_const_stable(feature = "const_ascii_ctype_on_intrinsics", since = "1.47.0")]
#[inline]
pub const fn is_ascii_hexdigit(&self) -> bool {
2020-10-05 16:29:07 +02:00
matches!(*self, '0'..='9' | 'A'..='F' | 'a'..='f')
}
/// Checks if the value is an ASCII punctuation character:
///
2019-06-14 19:24:38 +02:00
/// - U+0021 ..= U+002F `! " # $ % & ' ( ) * + , - . /`, or
/// - U+003A ..= U+0040 `: ; < = > ? @`, or
/// - U+005B ..= U+0060 ``[ \ ] ^ _ ` ``, or
/// - U+007B ..= U+007E `{ | } ~`
///
/// # Examples
///
/// ```
/// let uppercase_a = 'A';
/// let uppercase_g = 'G';
/// let a = 'a';
/// let g = 'g';
/// let zero = '0';
/// let percent = '%';
/// let space = ' ';
/// let lf = '\n';
/// let esc: char = 0x1b_u8.into();
///
/// assert!(!uppercase_a.is_ascii_punctuation());
/// assert!(!uppercase_g.is_ascii_punctuation());
/// assert!(!a.is_ascii_punctuation());
/// assert!(!g.is_ascii_punctuation());
/// assert!(!zero.is_ascii_punctuation());
/// assert!(percent.is_ascii_punctuation());
/// assert!(!space.is_ascii_punctuation());
/// assert!(!lf.is_ascii_punctuation());
/// assert!(!esc.is_ascii_punctuation());
/// ```
#[stable(feature = "ascii_ctype_on_intrinsics", since = "1.24.0")]
#[rustc_const_stable(feature = "const_ascii_ctype_on_intrinsics", since = "1.47.0")]
#[inline]
pub const fn is_ascii_punctuation(&self) -> bool {
2020-10-05 16:29:07 +02:00
matches!(*self, '!'..='/' | ':'..='@' | '['..='`' | '{'..='~')
}
/// Checks if the value is an ASCII graphic character:
2019-06-14 19:24:38 +02:00
/// U+0021 '!' ..= U+007E '~'.
///
/// # Examples
///
/// ```
/// let uppercase_a = 'A';
/// let uppercase_g = 'G';
/// let a = 'a';
/// let g = 'g';
/// let zero = '0';
/// let percent = '%';
/// let space = ' ';
/// let lf = '\n';
/// let esc: char = 0x1b_u8.into();
///
/// assert!(uppercase_a.is_ascii_graphic());
/// assert!(uppercase_g.is_ascii_graphic());
/// assert!(a.is_ascii_graphic());
/// assert!(g.is_ascii_graphic());
/// assert!(zero.is_ascii_graphic());
/// assert!(percent.is_ascii_graphic());
/// assert!(!space.is_ascii_graphic());
/// assert!(!lf.is_ascii_graphic());
/// assert!(!esc.is_ascii_graphic());
/// ```
#[stable(feature = "ascii_ctype_on_intrinsics", since = "1.24.0")]
#[rustc_const_stable(feature = "const_ascii_ctype_on_intrinsics", since = "1.47.0")]
#[inline]
pub const fn is_ascii_graphic(&self) -> bool {
2020-10-05 16:29:07 +02:00
matches!(*self, '!'..='~')
}
/// Checks if the value is an ASCII whitespace character:
/// U+0020 SPACE, U+0009 HORIZONTAL TAB, U+000A LINE FEED,
/// U+000C FORM FEED, or U+000D CARRIAGE RETURN.
///
/// Rust uses the WhatWG Infra Standard's [definition of ASCII
/// whitespace][infra-aw]. There are several other definitions in
/// wide use. For instance, [the POSIX locale][pct] includes
/// U+000B VERTICAL TAB as well as all the above characters,
/// but—from the very same specification—[the default rule for
/// "field splitting" in the Bourne shell][bfs] considers *only*
/// SPACE, HORIZONTAL TAB, and LINE FEED as whitespace.
///
/// If you are writing a program that will process an existing
/// file format, check what that format's definition of whitespace is
/// before using this function.
///
/// [infra-aw]: https://infra.spec.whatwg.org/#ascii-whitespace
/// [pct]: http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap07.html#tag_07_03_01
/// [bfs]: http://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html#tag_18_06_05
///
/// # Examples
///
/// ```
/// let uppercase_a = 'A';
/// let uppercase_g = 'G';
/// let a = 'a';
/// let g = 'g';
/// let zero = '0';
/// let percent = '%';
/// let space = ' ';
/// let lf = '\n';
/// let esc: char = 0x1b_u8.into();
///
/// assert!(!uppercase_a.is_ascii_whitespace());
/// assert!(!uppercase_g.is_ascii_whitespace());
/// assert!(!a.is_ascii_whitespace());
/// assert!(!g.is_ascii_whitespace());
/// assert!(!zero.is_ascii_whitespace());
/// assert!(!percent.is_ascii_whitespace());
/// assert!(space.is_ascii_whitespace());
/// assert!(lf.is_ascii_whitespace());
/// assert!(!esc.is_ascii_whitespace());
/// ```
#[stable(feature = "ascii_ctype_on_intrinsics", since = "1.24.0")]
#[rustc_const_stable(feature = "const_ascii_ctype_on_intrinsics", since = "1.47.0")]
#[inline]
pub const fn is_ascii_whitespace(&self) -> bool {
2020-10-05 16:29:07 +02:00
matches!(*self, '\t' | '\n' | '\x0C' | '\r' | ' ')
}
/// Checks if the value is an ASCII control character:
2019-06-14 19:24:38 +02:00
/// U+0000 NUL ..= U+001F UNIT SEPARATOR, or U+007F DELETE.
/// Note that most ASCII whitespace characters are control
/// characters, but SPACE is not.
///
/// # Examples
///
/// ```
/// let uppercase_a = 'A';
/// let uppercase_g = 'G';
/// let a = 'a';
/// let g = 'g';
/// let zero = '0';
/// let percent = '%';
/// let space = ' ';
/// let lf = '\n';
/// let esc: char = 0x1b_u8.into();
///
/// assert!(!uppercase_a.is_ascii_control());
/// assert!(!uppercase_g.is_ascii_control());
/// assert!(!a.is_ascii_control());
/// assert!(!g.is_ascii_control());
/// assert!(!zero.is_ascii_control());
/// assert!(!percent.is_ascii_control());
/// assert!(!space.is_ascii_control());
/// assert!(lf.is_ascii_control());
/// assert!(esc.is_ascii_control());
/// ```
#[stable(feature = "ascii_ctype_on_intrinsics", since = "1.24.0")]
#[rustc_const_stable(feature = "const_ascii_ctype_on_intrinsics", since = "1.47.0")]
#[inline]
pub const fn is_ascii_control(&self) -> bool {
2020-10-05 16:29:07 +02:00
matches!(*self, '\0'..='\x1F' | '\x7F')
}
2015-03-11 05:13:16 +01:00
}
#[inline]
const fn len_utf8(code: u32) -> usize {
    // Compare against each threshold in ascending order and return at the
    // first one the value falls below.
    if code < MAX_ONE_B {
        return 1;
    }
    if code < MAX_TWO_B {
        return 2;
    }
    if code < MAX_THREE_B {
        return 3;
    }
    // Everything at or above the last threshold takes four bytes.
    4
}
/// Encodes a raw u32 value as UTF-8 into the provided byte buffer,
/// and then returns the subslice of the buffer that contains the encoded character.
///
/// Unlike `char::encode_utf8`, this method also handles codepoints in the surrogate range.
/// (Creating a `char` in the surrogate range is UB.)
/// The result is valid [generalized UTF-8] but not valid UTF-8.
///
/// [generalized UTF-8]: https://simonsapin.github.io/wtf-8/#generalized-utf8
///
/// # Panics
///
/// Panics if the buffer is not large enough.
/// A buffer of length four is large enough to encode any `char`.
#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
#[doc(hidden)]
#[inline]
pub fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> &mut [u8] {
    // Byte count of the encoding, as reported by `len_utf8`.
    let needed = len_utf8(code);
    // Matching the required length together with a slice pattern checks the
    // buffer size and binds the destination bytes in a single step.
    match (needed, &mut *dst) {
        (1, [only, ..]) => {
            *only = code as u8;
        }
        (2, [lead, tail, ..]) => {
            *lead = ((code >> 6) & 0x1F) as u8 | TAG_TWO_B;
            *tail = (code & 0x3F) as u8 | TAG_CONT;
        }
        (3, [lead, mid, tail, ..]) => {
            *lead = ((code >> 12) & 0x0F) as u8 | TAG_THREE_B;
            *mid = ((code >> 6) & 0x3F) as u8 | TAG_CONT;
            *tail = (code & 0x3F) as u8 | TAG_CONT;
        }
        (4, [lead, upper, lower, tail, ..]) => {
            *lead = ((code >> 18) & 0x07) as u8 | TAG_FOUR_B;
            *upper = ((code >> 12) & 0x3F) as u8 | TAG_CONT;
            *lower = ((code >> 6) & 0x3F) as u8 | TAG_CONT;
            *tail = (code & 0x3F) as u8 | TAG_CONT;
        }
        // No pattern fit: the buffer is shorter than the encoding.
        _ => panic!(
            "encode_utf8: need {} bytes to encode U+{:X}, but the buffer has {}",
            needed,
            code,
            dst.len(),
        ),
    };
    // Hand back only the prefix that was written.
    &mut dst[..needed]
}
/// Encodes a raw u32 value as UTF-16 into the provided `u16` buffer,
/// and then returns the subslice of the buffer that contains the encoded character.
///
/// Unlike `char::encode_utf16`, this method also handles codepoints in the surrogate range.
/// (Creating a `char` in the surrogate range is UB.)
///
/// # Panics
///
/// Panics if the buffer is not large enough.
/// A buffer of length 2 is large enough to encode any `char`.
#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
#[doc(hidden)]
#[inline]
pub fn encode_utf16_raw(mut code: u32, dst: &mut [u16]) -> &mut [u16] {
    // Number of UTF-16 units required: one if the value fits in 16 bits,
    // otherwise a surrogate pair. This is computed on the raw `u32` so that no
    // `char` is ever constructed; `code` may be a surrogate, for which creating
    // a `char` would be undefined behaviour.
    let len: usize = if (code & 0xFFFF) == code { 1 } else { 2 };
    // The slice patterns double as the buffer-size check.
    match (len, &mut *dst) {
        (1, [unit, ..]) => {
            // Values that fit in 16 bits are stored as-is.
            *unit = code as u16;
        }
        (2, [high, low, ..]) => {
            // Supplementary planes break into surrogates.
            code -= 0x1_0000;
            *high = 0xD800 | ((code >> 10) as u16);
            *low = 0xDC00 | ((code as u16) & 0x3FF);
        }
        // No pattern fit: the buffer is shorter than the encoding.
        _ => panic!(
            "encode_utf16: need {} units to encode U+{:X}, but the buffer has {}",
            len,
            code,
            dst.len(),
        ),
    };
    // SAFETY: reaching this point means one of the first two arms matched, so
    // `dst` holds at least `len` elements; the returned slice covers exactly
    // that written prefix of `dst` and inherits its exclusive borrow.
    unsafe { slice::from_raw_parts_mut(dst.as_mut_ptr(), len) }
}