From 9bae6ec828fdc7f87838ee008cccef90e31b9f84 Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Wed, 30 Apr 2014 23:06:36 -0700 Subject: [PATCH] core: Inherit possible string functionality This moves as much allocation as possible from teh std::str module into core::str. This includes essentially all non-allocating functionality, mostly iterators and slicing and such. This primarily splits the Str trait into only having the as_slice() method, adding a new StrAllocating trait to std::str which contains the relevant new allocation methods. This is a breaking change if any of the methods of "trait Str" were overriden. The old functionality can be restored by implementing both the Str and StrAllocating traits. [breaking-change] --- src/libcore/lib.rs | 1 + src/libcore/str.rs | 1861 +++++++++++++++++++++++++++++ src/libstd/fmt/mod.rs | 21 +- src/libstd/fmt/num.rs | 2 +- src/libstd/hash/sip.rs | 2 +- src/libstd/io/mod.rs | 2 +- src/libstd/io/process.rs | 1 - src/libstd/local_data.rs | 1 - src/libstd/os.rs | 2 +- src/libstd/path/windows.rs | 6 +- src/libstd/prelude.rs | 1 + src/libstd/repr.rs | 1 + src/libstd/rt/args.rs | 1 - src/libstd/str.rs | 2023 ++------------------------------ src/libstd/strbuf.rs | 4 +- src/libstd/task.rs | 2 +- src/libstd/to_str.rs | 2 +- src/libsyntax/util/interner.rs | 5 - 18 files changed, 1986 insertions(+), 1952 deletions(-) create mode 100644 src/libcore/str.rs diff --git a/src/libcore/lib.rs b/src/libcore/lib.rs index b41e32de095..e89f5181402 100644 --- a/src/libcore/lib.rs +++ b/src/libcore/lib.rs @@ -71,4 +71,5 @@ pub mod option; pub mod raw; pub mod char; pub mod slice; +pub mod str; pub mod tuple; diff --git a/src/libcore/str.rs b/src/libcore/str.rs new file mode 100644 index 00000000000..9d9e12d0ad5 --- /dev/null +++ b/src/libcore/str.rs @@ -0,0 +1,1861 @@ +// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! String manipulation +//! +//! For more details, see std::str + +use cast::transmute; +use cast; +use char; +use clone::Clone; +use cmp::{Eq, TotalEq}; +use container::Container; +use default::Default; +use iter::{Filter, Map, Iterator}; +use iter::{Rev, DoubleEndedIterator, ExactSize}; +use num::Saturating; +use option::{None, Option, Some}; +use raw::Repr; +use slice::{ImmutableVector, Vector}; +use slice; + +/* +Section: Creating a string +*/ + +/// Converts a vector to a string slice without performing any allocations. +/// +/// Once the slice has been validated as utf-8, it is transmuted in-place and +/// returned as a '&str' instead of a '&[u8]' +/// +/// Returns None if the slice is not utf-8. +pub fn from_utf8<'a>(v: &'a [u8]) -> Option<&'a str> { + if is_utf8(v) { + Some(unsafe { raw::from_utf8(v) }) + } else { None } +} + +/// Something that can be used to compare against a character +pub trait CharEq { + /// Determine if the splitter should split at the given character + fn matches(&mut self, char) -> bool; + /// Indicate if this is only concerned about ASCII characters, + /// which can allow for a faster implementation. + fn only_ascii(&self) -> bool; +} + +impl CharEq for char { + #[inline] + fn matches(&mut self, c: char) -> bool { *self == c } + + #[inline] + fn only_ascii(&self) -> bool { (*self as uint) < 128 } +} + +impl<'a> CharEq for |char|: 'a -> bool { + #[inline] + fn matches(&mut self, c: char) -> bool { (*self)(c) } + + #[inline] + fn only_ascii(&self) -> bool { false } +} + +impl CharEq for extern "Rust" fn(char) -> bool { + #[inline] + fn matches(&mut self, c: char) -> bool { (*self)(c) } + + #[inline] + fn only_ascii(&self) -> bool { false } +} + +impl<'a> CharEq for &'a [char] { + #[inline] + fn matches(&mut self, c: char) -> bool { + self.iter().any(|&mut m| m.matches(c)) + } + + #[inline] + fn only_ascii(&self) -> bool { + self.iter().all(|m| m.only_ascii()) + } +} + +/* +Section: Iterators +*/ + +/// External iterator for a string's characters. +/// Use with the `std::iter` module. +#[deriving(Clone)] +pub struct Chars<'a> { + /// The slice remaining to be iterated + string: &'a str, +} + +impl<'a> Iterator for Chars<'a> { + #[inline] + fn next(&mut self) -> Option { + // Decode the next codepoint, then update + // the slice to be just the remaining part + if self.string.len() != 0 { + let CharRange {ch, next} = self.string.char_range_at(0); + unsafe { + self.string = raw::slice_unchecked(self.string, next, self.string.len()); + } + Some(ch) + } else { + None + } + } + + #[inline] + fn size_hint(&self) -> (uint, Option) { + (self.string.len().saturating_add(3)/4, Some(self.string.len())) + } +} + +impl<'a> DoubleEndedIterator for Chars<'a> { + #[inline] + fn next_back(&mut self) -> Option { + if self.string.len() != 0 { + let CharRange {ch, next} = self.string.char_range_at_reverse(self.string.len()); + unsafe { + self.string = raw::slice_unchecked(self.string, 0, next); + } + Some(ch) + } else { + None + } + } +} + +/// External iterator for a string's characters and their byte offsets. +/// Use with the `std::iter` module. +#[deriving(Clone)] +pub struct CharOffsets<'a> { + /// The original string to be iterated + string: &'a str, + iter: Chars<'a>, +} + +impl<'a> Iterator<(uint, char)> for CharOffsets<'a> { + #[inline] + fn next(&mut self) -> Option<(uint, char)> { + // Compute the byte offset by using the pointer offset between + // the original string slice and the iterator's remaining part + let offset = self.iter.string.as_ptr() as uint - self.string.as_ptr() as uint; + self.iter.next().map(|ch| (offset, ch)) + } + + #[inline] + fn size_hint(&self) -> (uint, Option) { + self.iter.size_hint() + } +} + +impl<'a> DoubleEndedIterator<(uint, char)> for CharOffsets<'a> { + #[inline] + fn next_back(&mut self) -> Option<(uint, char)> { + self.iter.next_back().map(|ch| { + let offset = self.iter.string.len() + + self.iter.string.as_ptr() as uint - self.string.as_ptr() as uint; + (offset, ch) + }) + } +} + +#[deprecated = "replaced by Rev>"] +pub type RevChars<'a> = Rev>; + +#[deprecated = "replaced by Rev>"] +pub type RevCharOffsets<'a> = Rev>; + +/// External iterator for a string's bytes. +/// Use with the `std::iter` module. +pub type Bytes<'a> = + Map<'a, &'a u8, u8, slice::Items<'a, u8>>; + +#[deprecated = "replaced by Rev>"] +pub type RevBytes<'a> = Rev>; + +/// An iterator over the substrings of a string, separated by `sep`. +#[deriving(Clone)] +pub struct CharSplits<'a, Sep> { + /// The slice remaining to be iterated + string: &'a str, + sep: Sep, + /// Whether an empty string at the end is allowed + allow_trailing_empty: bool, + only_ascii: bool, + finished: bool, +} + +#[deprecated = "replaced by Rev>"] +pub type RevCharSplits<'a, Sep> = Rev>; + +/// An iterator over the substrings of a string, separated by `sep`, +/// splitting at most `count` times. +#[deriving(Clone)] +pub struct CharSplitsN<'a, Sep> { + iter: CharSplits<'a, Sep>, + /// The number of splits remaining + count: uint, + invert: bool, +} + +/// An iterator over the words of a string, separated by a sequence of whitespace +pub type Words<'a> = + Filter<'a, &'a str, CharSplits<'a, extern "Rust" fn(char) -> bool>>; + +/// An iterator over the lines of a string, separated by either `\n` or (`\r\n`). +pub type AnyLines<'a> = + Map<'a, &'a str, &'a str, CharSplits<'a, char>>; + +impl<'a, Sep> CharSplits<'a, Sep> { + #[inline] + fn get_end(&mut self) -> Option<&'a str> { + if !self.finished && (self.allow_trailing_empty || self.string.len() > 0) { + self.finished = true; + Some(self.string) + } else { + None + } + } +} + +impl<'a, Sep: CharEq> Iterator<&'a str> for CharSplits<'a, Sep> { + #[inline] + fn next(&mut self) -> Option<&'a str> { + if self.finished { return None } + + let mut next_split = None; + if self.only_ascii { + for (idx, byte) in self.string.bytes().enumerate() { + if self.sep.matches(byte as char) && byte < 128u8 { + next_split = Some((idx, idx + 1)); + break; + } + } + } else { + for (idx, ch) in self.string.char_indices() { + if self.sep.matches(ch) { + next_split = Some((idx, self.string.char_range_at(idx).next)); + break; + } + } + } + match next_split { + Some((a, b)) => unsafe { + let elt = raw::slice_unchecked(self.string, 0, a); + self.string = raw::slice_unchecked(self.string, b, self.string.len()); + Some(elt) + }, + None => self.get_end(), + } + } +} + +impl<'a, Sep: CharEq> DoubleEndedIterator<&'a str> +for CharSplits<'a, Sep> { + #[inline] + fn next_back(&mut self) -> Option<&'a str> { + if self.finished { return None } + + if !self.allow_trailing_empty { + self.allow_trailing_empty = true; + match self.next_back() { + Some(elt) if !elt.is_empty() => return Some(elt), + _ => if self.finished { return None } + } + } + let len = self.string.len(); + let mut next_split = None; + + if self.only_ascii { + for (idx, byte) in self.string.bytes().enumerate().rev() { + if self.sep.matches(byte as char) && byte < 128u8 { + next_split = Some((idx, idx + 1)); + break; + } + } + } else { + for (idx, ch) in self.string.char_indices().rev() { + if self.sep.matches(ch) { + next_split = Some((idx, self.string.char_range_at(idx).next)); + break; + } + } + } + match next_split { + Some((a, b)) => unsafe { + let elt = raw::slice_unchecked(self.string, b, len); + self.string = raw::slice_unchecked(self.string, 0, a); + Some(elt) + }, + None => { self.finished = true; Some(self.string) } + } + } +} + +impl<'a, Sep: CharEq> Iterator<&'a str> for CharSplitsN<'a, Sep> { + #[inline] + fn next(&mut self) -> Option<&'a str> { + if self.count != 0 { + self.count -= 1; + if self.invert { self.iter.next_back() } else { self.iter.next() } + } else { + self.iter.get_end() + } + } +} + +/// An iterator over the start and end indices of the matches of a +/// substring within a larger string +#[deriving(Clone)] +pub struct MatchIndices<'a> { + haystack: &'a str, + needle: &'a str, + position: uint, +} + +/// An iterator over the substrings of a string separated by a given +/// search string +#[deriving(Clone)] +pub struct StrSplits<'a> { + it: MatchIndices<'a>, + last_end: uint, + finished: bool +} + +impl<'a> Iterator<(uint, uint)> for MatchIndices<'a> { + #[inline] + fn next(&mut self) -> Option<(uint, uint)> { + // See Issue #1932 for why this is a naive search + let (h_len, n_len) = (self.haystack.len(), self.needle.len()); + let mut match_start = 0; + let mut match_i = 0; + + while self.position < h_len { + if self.haystack[self.position] == self.needle[match_i] { + if match_i == 0 { match_start = self.position; } + match_i += 1; + self.position += 1; + + if match_i == n_len { + // found a match! + return Some((match_start, self.position)); + } + } else { + // failed match, backtrack + if match_i > 0 { + match_i = 0; + self.position = match_start; + } + self.position += 1; + } + } + None + } +} + +impl<'a> Iterator<&'a str> for StrSplits<'a> { + #[inline] + fn next(&mut self) -> Option<&'a str> { + if self.finished { return None; } + + match self.it.next() { + Some((from, to)) => { + let ret = Some(self.it.haystack.slice(self.last_end, from)); + self.last_end = to; + ret + } + None => { + self.finished = true; + Some(self.it.haystack.slice(self.last_end, self.it.haystack.len())) + } + } + } +} + +/* +Section: Comparing strings +*/ + +// share the implementation of the lang-item vs. non-lang-item +// eq_slice. +#[inline] +fn eq_slice_(a: &str, b: &str) -> bool { + #[allow(ctypes)] + extern { fn memcmp(s1: *i8, s2: *i8, n: uint) -> i32; } + a.len() == b.len() && unsafe { + memcmp(a.as_ptr() as *i8, + b.as_ptr() as *i8, + a.len()) == 0 + } +} + +/// Bytewise slice equality +#[cfg(not(test))] +#[lang="str_eq"] +#[inline] +pub fn eq_slice(a: &str, b: &str) -> bool { + eq_slice_(a, b) +} + +/// Bytewise slice equality +#[cfg(test)] +#[inline] +pub fn eq_slice(a: &str, b: &str) -> bool { + eq_slice_(a, b) +} + +/// Bytewise string equality +#[cfg(not(test))] +#[lang="uniq_str_eq"] +#[inline] +pub fn eq(a: &~str, b: &~str) -> bool { + eq_slice(*a, *b) +} + +#[cfg(test)] +#[inline] +pub fn eq(a: &~str, b: &~str) -> bool { + eq_slice(*a, *b) +} + +/* +Section: Misc +*/ + +/// Walk through `iter` checking that it's a valid UTF-8 sequence, +/// returning `true` in that case, or, if it is invalid, `false` with +/// `iter` reset such that it is pointing at the first byte in the +/// invalid sequence. +#[inline(always)] +fn run_utf8_validation_iterator(iter: &mut slice::Items) -> bool { + loop { + // save the current thing we're pointing at. + let old = *iter; + + // restore the iterator we had at the start of this codepoint. + macro_rules! err ( () => { {*iter = old; return false} }); + macro_rules! next ( () => { + match iter.next() { + Some(a) => *a, + // we needed data, but there was none: error! + None => err!() + } + }); + + let first = match iter.next() { + Some(&b) => b, + // we're at the end of the iterator and a codepoint + // boundary at the same time, so this string is valid. + None => return true + }; + + // ASCII characters are always valid, so only large + // bytes need more examination. + if first >= 128 { + let w = utf8_char_width(first); + let second = next!(); + // 2-byte encoding is for codepoints \u0080 to \u07ff + // first C2 80 last DF BF + // 3-byte encoding is for codepoints \u0800 to \uffff + // first E0 A0 80 last EF BF BF + // excluding surrogates codepoints \ud800 to \udfff + // ED A0 80 to ED BF BF + // 4-byte encoding is for codepoints \u10000 to \u10ffff + // first F0 90 80 80 last F4 8F BF BF + // + // Use the UTF-8 syntax from the RFC + // + // https://tools.ietf.org/html/rfc3629 + // UTF8-1 = %x00-7F + // UTF8-2 = %xC2-DF UTF8-tail + // UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) / + // %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail ) + // UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) / + // %xF4 %x80-8F 2( UTF8-tail ) + match w { + 2 => if second & 192 != TAG_CONT_U8 {err!()}, + 3 => { + match (first, second, next!() & 192) { + (0xE0 , 0xA0 .. 0xBF, TAG_CONT_U8) | + (0xE1 .. 0xEC, 0x80 .. 0xBF, TAG_CONT_U8) | + (0xED , 0x80 .. 0x9F, TAG_CONT_U8) | + (0xEE .. 0xEF, 0x80 .. 0xBF, TAG_CONT_U8) => {} + _ => err!() + } + } + 4 => { + match (first, second, next!() & 192, next!() & 192) { + (0xF0 , 0x90 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) | + (0xF1 .. 0xF3, 0x80 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) | + (0xF4 , 0x80 .. 0x8F, TAG_CONT_U8, TAG_CONT_U8) => {} + _ => err!() + } + } + _ => err!() + } + } + } +} + +/// Determines if a vector of bytes contains valid UTF-8. +pub fn is_utf8(v: &[u8]) -> bool { + run_utf8_validation_iterator(&mut v.iter()) +} + +/// Determines if a vector of `u16` contains valid UTF-16 +pub fn is_utf16(v: &[u16]) -> bool { + let mut it = v.iter(); + macro_rules! next ( ($ret:expr) => { + match it.next() { Some(u) => *u, None => return $ret } + } + ) + loop { + let u = next!(true); + + match char::from_u32(u as u32) { + Some(_) => {} + None => { + let u2 = next!(false); + if u < 0xD7FF || u > 0xDBFF || + u2 < 0xDC00 || u2 > 0xDFFF { return false; } + } + } + } +} + +/// An iterator that decodes UTF-16 encoded codepoints from a vector +/// of `u16`s. +#[deriving(Clone)] +pub struct UTF16Items<'a> { + iter: slice::Items<'a, u16> +} +/// The possibilities for values decoded from a `u16` stream. +#[deriving(Eq, TotalEq, Clone)] +pub enum UTF16Item { + /// A valid codepoint. + ScalarValue(char), + /// An invalid surrogate without its pair. + LoneSurrogate(u16) +} + +impl UTF16Item { + /// Convert `self` to a `char`, taking `LoneSurrogate`s to the + /// replacement character (U+FFFD). + #[inline] + pub fn to_char_lossy(&self) -> char { + match *self { + ScalarValue(c) => c, + LoneSurrogate(_) => '\uFFFD' + } + } +} + +impl<'a> Iterator for UTF16Items<'a> { + fn next(&mut self) -> Option { + let u = match self.iter.next() { + Some(u) => *u, + None => return None + }; + + if u < 0xD800 || 0xDFFF < u { + // not a surrogate + Some(ScalarValue(unsafe {cast::transmute(u as u32)})) + } else if u >= 0xDC00 { + // a trailing surrogate + Some(LoneSurrogate(u)) + } else { + // preserve state for rewinding. + let old = self.iter; + + let u2 = match self.iter.next() { + Some(u2) => *u2, + // eof + None => return Some(LoneSurrogate(u)) + }; + if u2 < 0xDC00 || u2 > 0xDFFF { + // not a trailing surrogate so we're not a valid + // surrogate pair, so rewind to redecode u2 next time. + self.iter = old; + return Some(LoneSurrogate(u)) + } + + // all ok, so lets decode it. + let c = ((u - 0xD800) as u32 << 10 | (u2 - 0xDC00) as u32) + 0x1_0000; + Some(ScalarValue(unsafe {cast::transmute(c)})) + } + } + + #[inline] + fn size_hint(&self) -> (uint, Option) { + let (low, high) = self.iter.size_hint(); + // we could be entirely valid surrogates (2 elements per + // char), or entirely non-surrogates (1 element per char) + (low / 2, high) + } +} + +/// Create an iterator over the UTF-16 encoded codepoints in `v`, +/// returning invalid surrogates as `LoneSurrogate`s. +/// +/// # Example +/// +/// ```rust +/// use std::str; +/// use std::str::{ScalarValue, LoneSurrogate}; +/// +/// // 𝄞music +/// let v = [0xD834, 0xDD1E, 0x006d, 0x0075, +/// 0x0073, 0xDD1E, 0x0069, 0x0063, +/// 0xD834]; +/// +/// assert_eq!(str::utf16_items(v).collect::<~[_]>(), +/// ~[ScalarValue('𝄞'), +/// ScalarValue('m'), ScalarValue('u'), ScalarValue('s'), +/// LoneSurrogate(0xDD1E), +/// ScalarValue('i'), ScalarValue('c'), +/// LoneSurrogate(0xD834)]); +/// ``` +pub fn utf16_items<'a>(v: &'a [u16]) -> UTF16Items<'a> { + UTF16Items { iter : v.iter() } +} + +/// Return a slice of `v` ending at (and not including) the first NUL +/// (0). +/// +/// # Example +/// +/// ```rust +/// use std::str; +/// +/// // "abcd" +/// let mut v = ['a' as u16, 'b' as u16, 'c' as u16, 'd' as u16]; +/// // no NULs so no change +/// assert_eq!(str::truncate_utf16_at_nul(v), v.as_slice()); +/// +/// // "ab\0d" +/// v[2] = 0; +/// assert_eq!(str::truncate_utf16_at_nul(v), +/// &['a' as u16, 'b' as u16]); +/// ``` +pub fn truncate_utf16_at_nul<'a>(v: &'a [u16]) -> &'a [u16] { + match v.iter().position(|c| *c == 0) { + // don't include the 0 + Some(i) => v.slice_to(i), + None => v + } +} + +// https://tools.ietf.org/html/rfc3629 +static UTF8_CHAR_WIDTH: [u8, ..256] = [ +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF +0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2, +2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF +3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF +4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF +]; + +/// Given a first byte, determine how many bytes are in this UTF-8 character +#[inline] +pub fn utf8_char_width(b: u8) -> uint { + return UTF8_CHAR_WIDTH[b as uint] as uint; +} + +/// Struct that contains a `char` and the index of the first byte of +/// the next `char` in a string. This can be used as a data structure +/// for iterating over the UTF-8 bytes of a string. +pub struct CharRange { + /// Current `char` + pub ch: char, + /// Index of the first byte of the next `char` + pub next: uint, +} + +// Return the initial codepoint accumulator for the first byte. +// The first byte is special, only want bottom 5 bits for width 2, 4 bits +// for width 3, and 3 bits for width 4 +macro_rules! utf8_first_byte( + ($byte:expr, $width:expr) => (($byte & (0x7F >> $width)) as u32) +) + +// return the value of $ch updated with continuation byte $byte +macro_rules! utf8_acc_cont_byte( + ($ch:expr, $byte:expr) => (($ch << 6) | ($byte & 63u8) as u32) +) + +static TAG_CONT_U8: u8 = 128u8; + +/// Unsafe operations +pub mod raw { + use cast; + use container::Container; + use iter::Iterator; + use ptr::RawPtr; + use raw::Slice; + use slice::{ImmutableVector}; + use str::{is_utf8, StrSlice}; + + /// Converts a slice of bytes to a string slice without checking + /// that the string contains valid UTF-8. + pub unsafe fn from_utf8<'a>(v: &'a [u8]) -> &'a str { + cast::transmute(v) + } + + /// Form a slice from a C string. Unsafe because the caller must ensure the + /// C string has the static lifetime, or else the return value may be + /// invalidated later. + pub unsafe fn c_str_to_static_slice(s: *i8) -> &'static str { + let s = s as *u8; + let mut curr = s; + let mut len = 0u; + while *curr != 0u8 { + len += 1u; + curr = s.offset(len as int); + } + let v = Slice { data: s, len: len }; + assert!(is_utf8(::cast::transmute(v))); + ::cast::transmute(v) + } + + /// Takes a bytewise (not UTF-8) slice from a string. + /// + /// Returns the substring from [`begin`..`end`). + /// + /// # Failure + /// + /// If begin is greater than end. + /// If end is greater than the length of the string. + #[inline] + pub unsafe fn slice_bytes<'a>(s: &'a str, begin: uint, end: uint) -> &'a str { + assert!(begin <= end); + assert!(end <= s.len()); + slice_unchecked(s, begin, end) + } + + /// Takes a bytewise (not UTF-8) slice from a string. + /// + /// Returns the substring from [`begin`..`end`). + /// + /// Caller must check slice boundaries! + #[inline] + pub unsafe fn slice_unchecked<'a>(s: &'a str, begin: uint, end: uint) -> &'a str { + cast::transmute(Slice { + data: s.as_ptr().offset(begin as int), + len: end - begin, + }) + } +} + +/* +Section: Trait implementations +*/ + +#[cfg(not(test))] +#[allow(missing_doc)] +pub mod traits { + use container::Container; + use cmp::{TotalOrd, Ordering, Less, Equal, Greater, Eq, Ord, Equiv, TotalEq}; + use iter::Iterator; + use option::{Some, None}; + use str::{Str, StrSlice, eq_slice}; + + impl<'a> TotalOrd for &'a str { + #[inline] + fn cmp(&self, other: & &'a str) -> Ordering { + for (s_b, o_b) in self.bytes().zip(other.bytes()) { + match s_b.cmp(&o_b) { + Greater => return Greater, + Less => return Less, + Equal => () + } + } + + self.len().cmp(&other.len()) + } + } + + impl TotalOrd for ~str { + #[inline] + fn cmp(&self, other: &~str) -> Ordering { self.as_slice().cmp(&other.as_slice()) } + } + + impl<'a> Eq for &'a str { + #[inline] + fn eq(&self, other: & &'a str) -> bool { + eq_slice((*self), (*other)) + } + #[inline] + fn ne(&self, other: & &'a str) -> bool { !(*self).eq(other) } + } + + impl Eq for ~str { + #[inline] + fn eq(&self, other: &~str) -> bool { + eq_slice((*self), (*other)) + } + } + + impl<'a> TotalEq for &'a str {} + + impl TotalEq for ~str {} + + impl<'a> Ord for &'a str { + #[inline] + fn lt(&self, other: & &'a str) -> bool { self.cmp(other) == Less } + } + + impl Ord for ~str { + #[inline] + fn lt(&self, other: &~str) -> bool { self.cmp(other) == Less } + } + + impl<'a, S: Str> Equiv for &'a str { + #[inline] + fn equiv(&self, other: &S) -> bool { eq_slice(*self, other.as_slice()) } + } + + impl<'a, S: Str> Equiv for ~str { + #[inline] + fn equiv(&self, other: &S) -> bool { eq_slice(*self, other.as_slice()) } + } +} + +#[cfg(test)] +pub mod traits {} + +/// Any string that can be represented as a slice +pub trait Str { + /// Work with `self` as a slice. + fn as_slice<'a>(&'a self) -> &'a str; +} + +impl<'a> Str for &'a str { + #[inline] + fn as_slice<'a>(&'a self) -> &'a str { *self } +} + +impl<'a> Str for ~str { + #[inline] + fn as_slice<'a>(&'a self) -> &'a str { let s: &'a str = *self; s } +} + +impl<'a> Container for &'a str { + #[inline] + fn len(&self) -> uint { + self.repr().len + } +} + +impl Container for ~str { + #[inline] + fn len(&self) -> uint { self.as_slice().len() } +} + +/// Methods for string slices +pub trait StrSlice<'a> { + /// Returns true if one string contains another + /// + /// # Arguments + /// + /// - needle - The string to look for + fn contains<'a>(&self, needle: &'a str) -> bool; + + /// Returns true if a string contains a char. + /// + /// # Arguments + /// + /// - needle - The char to look for + fn contains_char(&self, needle: char) -> bool; + + /// An iterator over the characters of `self`. Note, this iterates + /// over unicode code-points, not unicode graphemes. + /// + /// # Example + /// + /// ```rust + /// let v: ~[char] = "abc åäö".chars().collect(); + /// assert_eq!(v, ~['a', 'b', 'c', ' ', 'å', 'ä', 'ö']); + /// ``` + fn chars(&self) -> Chars<'a>; + + /// Do not use this - it is deprecated. + #[deprecated = "replaced by .chars().rev()"] + fn chars_rev(&self) -> Rev>; + + /// An iterator over the bytes of `self` + fn bytes(&self) -> Bytes<'a>; + + /// Do not use this - it is deprecated. + #[deprecated = "replaced by .bytes().rev()"] + fn bytes_rev(&self) -> Rev>; + + /// An iterator over the characters of `self` and their byte offsets. + fn char_indices(&self) -> CharOffsets<'a>; + + /// Do not use this - it is deprecated. + #[deprecated = "replaced by .char_indices().rev()"] + fn char_indices_rev(&self) -> Rev>; + + /// An iterator over substrings of `self`, separated by characters + /// matched by `sep`. + /// + /// # Example + /// + /// ```rust + /// let v: ~[&str] = "Mary had a little lamb".split(' ').collect(); + /// assert_eq!(v, ~["Mary", "had", "a", "little", "lamb"]); + /// + /// let v: ~[&str] = "abc1def2ghi".split(|c: char| c.is_digit()).collect(); + /// assert_eq!(v, ~["abc", "def", "ghi"]); + /// + /// let v: ~[&str] = "lionXXtigerXleopard".split('X').collect(); + /// assert_eq!(v, ~["lion", "", "tiger", "leopard"]); + /// ``` + fn split(&self, sep: Sep) -> CharSplits<'a, Sep>; + + /// An iterator over substrings of `self`, separated by characters + /// matched by `sep`, restricted to splitting at most `count` + /// times. + /// + /// # Example + /// + /// ```rust + /// let v: ~[&str] = "Mary had a little lambda".splitn(' ', 2).collect(); + /// assert_eq!(v, ~["Mary", "had", "a little lambda"]); + /// + /// let v: ~[&str] = "abc1def2ghi".splitn(|c: char| c.is_digit(), 1).collect(); + /// assert_eq!(v, ~["abc", "def2ghi"]); + /// + /// let v: ~[&str] = "lionXXtigerXleopard".splitn('X', 2).collect(); + /// assert_eq!(v, ~["lion", "", "tigerXleopard"]); + /// ``` + fn splitn(&self, sep: Sep, count: uint) -> CharSplitsN<'a, Sep>; + + /// An iterator over substrings of `self`, separated by characters + /// matched by `sep`. + /// + /// Equivalent to `split`, except that the trailing substring + /// is skipped if empty (terminator semantics). + /// + /// # Example + /// + /// ```rust + /// let v: ~[&str] = "A.B.".split_terminator('.').collect(); + /// assert_eq!(v, ~["A", "B"]); + /// + /// let v: ~[&str] = "A..B..".split_terminator('.').collect(); + /// assert_eq!(v, ~["A", "", "B", ""]); + /// + /// let v: ~[&str] = "Mary had a little lamb".split(' ').rev().collect(); + /// assert_eq!(v, ~["lamb", "little", "a", "had", "Mary"]); + /// + /// let v: ~[&str] = "abc1def2ghi".split(|c: char| c.is_digit()).rev().collect(); + /// assert_eq!(v, ~["ghi", "def", "abc"]); + /// + /// let v: ~[&str] = "lionXXtigerXleopard".split('X').rev().collect(); + /// assert_eq!(v, ~["leopard", "tiger", "", "lion"]); + /// ``` + fn split_terminator(&self, sep: Sep) -> CharSplits<'a, Sep>; + + /// Do not use this - it is deprecated. + #[deprecated = "replaced by .split(sep).rev()"] + fn rsplit(&self, sep: Sep) -> Rev>; + + /// An iterator over substrings of `self`, separated by characters + /// matched by `sep`, starting from the end of the string. + /// Restricted to splitting at most `count` times. + /// + /// # Example + /// + /// ```rust + /// let v: ~[&str] = "Mary had a little lamb".rsplitn(' ', 2).collect(); + /// assert_eq!(v, ~["lamb", "little", "Mary had a"]); + /// + /// let v: ~[&str] = "abc1def2ghi".rsplitn(|c: char| c.is_digit(), 1).collect(); + /// assert_eq!(v, ~["ghi", "abc1def"]); + /// + /// let v: ~[&str] = "lionXXtigerXleopard".rsplitn('X', 2).collect(); + /// assert_eq!(v, ~["leopard", "tiger", "lionX"]); + /// ``` + fn rsplitn(&self, sep: Sep, count: uint) -> CharSplitsN<'a, Sep>; + + /// An iterator over the start and end indices of the disjoint + /// matches of `sep` within `self`. + /// + /// That is, each returned value `(start, end)` satisfies + /// `self.slice(start, end) == sep`. For matches of `sep` within + /// `self` that overlap, only the indicies corresponding to the + /// first match are returned. + /// + /// # Example + /// + /// ```rust + /// let v: ~[(uint, uint)] = "abcXXXabcYYYabc".match_indices("abc").collect(); + /// assert_eq!(v, ~[(0,3), (6,9), (12,15)]); + /// + /// let v: ~[(uint, uint)] = "1abcabc2".match_indices("abc").collect(); + /// assert_eq!(v, ~[(1,4), (4,7)]); + /// + /// let v: ~[(uint, uint)] = "ababa".match_indices("aba").collect(); + /// assert_eq!(v, ~[(0, 3)]); // only the first `aba` + /// ``` + fn match_indices(&self, sep: &'a str) -> MatchIndices<'a>; + + /// An iterator over the substrings of `self` separated by `sep`. + /// + /// # Example + /// + /// ```rust + /// let v: ~[&str] = "abcXXXabcYYYabc".split_str("abc").collect(); + /// assert_eq!(v, ~["", "XXX", "YYY", ""]); + /// + /// let v: ~[&str] = "1abcabc2".split_str("abc").collect(); + /// assert_eq!(v, ~["1", "", "2"]); + /// ``` + fn split_str(&self, &'a str) -> StrSplits<'a>; + + /// An iterator over the lines of a string (subsequences separated + /// by `\n`). This does not include the empty string after a + /// trailing `\n`. + /// + /// # Example + /// + /// ```rust + /// let four_lines = "foo\nbar\n\nbaz\n"; + /// let v: ~[&str] = four_lines.lines().collect(); + /// assert_eq!(v, ~["foo", "bar", "", "baz"]); + /// ``` + fn lines(&self) -> CharSplits<'a, char>; + + /// An iterator over the lines of a string, separated by either + /// `\n` or `\r\n`. As with `.lines()`, this does not include an + /// empty trailing line. + /// + /// # Example + /// + /// ```rust + /// let four_lines = "foo\r\nbar\n\r\nbaz\n"; + /// let v: ~[&str] = four_lines.lines_any().collect(); + /// assert_eq!(v, ~["foo", "bar", "", "baz"]); + /// ``` + fn lines_any(&self) -> AnyLines<'a>; + + /// An iterator over the words of a string (subsequences separated + /// by any sequence of whitespace). Sequences of whitespace are + /// collapsed, so empty "words" are not included. + /// + /// # Example + /// + /// ```rust + /// let some_words = " Mary had\ta little \n\t lamb"; + /// let v: ~[&str] = some_words.words().collect(); + /// assert_eq!(v, ~["Mary", "had", "a", "little", "lamb"]); + /// ``` + fn words(&self) -> Words<'a>; + + /// Returns true if the string contains only whitespace. + /// + /// Whitespace characters are determined by `char::is_whitespace`. + /// + /// # Example + /// + /// ```rust + /// assert!(" \t\n".is_whitespace()); + /// assert!("".is_whitespace()); + /// + /// assert!( !"abc".is_whitespace()); + /// ``` + fn is_whitespace(&self) -> bool; + + /// Returns true if the string contains only alphanumeric code + /// points. + /// + /// Alphanumeric characters are determined by `char::is_alphanumeric`. + /// + /// # Example + /// + /// ```rust + /// assert!("Löwe老虎Léopard123".is_alphanumeric()); + /// assert!("".is_alphanumeric()); + /// + /// assert!( !" &*~".is_alphanumeric()); + /// ``` + fn is_alphanumeric(&self) -> bool; + + /// Returns the number of Unicode code points (`char`) that a + /// string holds. + /// + /// This does not perform any normalization, and is `O(n)`, since + /// UTF-8 is a variable width encoding of code points. + /// + /// *Warning*: The number of code points in a string does not directly + /// correspond to the number of visible characters or width of the + /// visible text due to composing characters, and double- and + /// zero-width ones. + /// + /// See also `.len()` for the byte length. + /// + /// # Example + /// + /// ```rust + /// // composed forms of `ö` and `é` + /// let c = "Löwe 老虎 Léopard"; // German, Simplified Chinese, French + /// // decomposed forms of `ö` and `é` + /// let d = "Lo\u0308we 老虎 Le\u0301opard"; + /// + /// assert_eq!(c.char_len(), 15); + /// assert_eq!(d.char_len(), 17); + /// + /// assert_eq!(c.len(), 21); + /// assert_eq!(d.len(), 23); + /// + /// // the two strings *look* the same + /// println!("{}", c); + /// println!("{}", d); + /// ``` + fn char_len(&self) -> uint; + + /// Returns a slice of the given string from the byte range + /// [`begin`..`end`). + /// + /// This operation is `O(1)`. + /// + /// Fails when `begin` and `end` do not point to valid characters + /// or point beyond the last character of the string. + /// + /// See also `slice_to` and `slice_from` for slicing prefixes and + /// suffixes of strings, and `slice_chars` for slicing based on + /// code point counts. + /// + /// # Example + /// + /// ```rust + /// let s = "Löwe 老虎 Léopard"; + /// assert_eq!(s.slice(0, 1), "L"); + /// + /// assert_eq!(s.slice(1, 9), "öwe 老"); + /// + /// // these will fail: + /// // byte 2 lies within `ö`: + /// // s.slice(2, 3); + /// + /// // byte 8 lies within `老` + /// // s.slice(1, 8); + /// + /// // byte 100 is outside the string + /// // s.slice(3, 100); + /// ``` + fn slice(&self, begin: uint, end: uint) -> &'a str; + + /// Returns a slice of the string from `begin` to its end. + /// + /// Equivalent to `self.slice(begin, self.len())`. + /// + /// Fails when `begin` does not point to a valid character, or is + /// out of bounds. + /// + /// See also `slice`, `slice_to` and `slice_chars`. + fn slice_from(&self, begin: uint) -> &'a str; + + /// Returns a slice of the string from the beginning to byte + /// `end`. + /// + /// Equivalent to `self.slice(0, end)`. + /// + /// Fails when `end` does not point to a valid character, or is + /// out of bounds. + /// + /// See also `slice`, `slice_from` and `slice_chars`. + fn slice_to(&self, end: uint) -> &'a str; + + /// Returns a slice of the string from the character range + /// [`begin`..`end`). + /// + /// That is, start at the `begin`-th code point of the string and + /// continue to the `end`-th code point. This does not detect or + /// handle edge cases such as leaving a combining character as the + /// first code point of the string. + /// + /// Due to the design of UTF-8, this operation is `O(end)`. + /// See `slice`, `slice_to` and `slice_from` for `O(1)` + /// variants that use byte indices rather than code point + /// indices. + /// + /// Fails if `begin` > `end` or the either `begin` or `end` are + /// beyond the last character of the string. + /// + /// # Example + /// + /// ```rust + /// let s = "Löwe 老虎 Léopard"; + /// assert_eq!(s.slice_chars(0, 4), "Löwe"); + /// assert_eq!(s.slice_chars(5, 7), "老虎"); + /// ``` + fn slice_chars(&self, begin: uint, end: uint) -> &'a str; + + /// Returns true if `needle` is a prefix of the string. + fn starts_with(&self, needle: &str) -> bool; + + /// Returns true if `needle` is a suffix of the string. + fn ends_with(&self, needle: &str) -> bool; + + /// Returns a string with leading and trailing whitespace removed. + fn trim(&self) -> &'a str; + + /// Returns a string with leading whitespace removed. + fn trim_left(&self) -> &'a str; + + /// Returns a string with trailing whitespace removed. + fn trim_right(&self) -> &'a str; + + /// Returns a string with characters that match `to_trim` removed. + /// + /// # Arguments + /// + /// * to_trim - a character matcher + /// + /// # Example + /// + /// ```rust + /// assert_eq!("11foo1bar11".trim_chars('1'), "foo1bar") + /// assert_eq!("12foo1bar12".trim_chars(&['1', '2']), "foo1bar") + /// assert_eq!("123foo1bar123".trim_chars(|c: char| c.is_digit()), "foo1bar") + /// ``` + fn trim_chars(&self, to_trim: C) -> &'a str; + + /// Returns a string with leading `chars_to_trim` removed. + /// + /// # Arguments + /// + /// * to_trim - a character matcher + /// + /// # Example + /// + /// ```rust + /// assert_eq!("11foo1bar11".trim_left_chars('1'), "foo1bar11") + /// assert_eq!("12foo1bar12".trim_left_chars(&['1', '2']), "foo1bar12") + /// assert_eq!("123foo1bar123".trim_left_chars(|c: char| c.is_digit()), "foo1bar123") + /// ``` + fn trim_left_chars(&self, to_trim: C) -> &'a str; + + /// Returns a string with trailing `chars_to_trim` removed. + /// + /// # Arguments + /// + /// * to_trim - a character matcher + /// + /// # Example + /// + /// ```rust + /// assert_eq!("11foo1bar11".trim_right_chars('1'), "11foo1bar") + /// assert_eq!("12foo1bar12".trim_right_chars(&['1', '2']), "12foo1bar") + /// assert_eq!("123foo1bar123".trim_right_chars(|c: char| c.is_digit()), "123foo1bar") + /// ``` + fn trim_right_chars(&self, to_trim: C) -> &'a str; + + /// Check that `index`-th byte lies at the start and/or end of a + /// UTF-8 code point sequence. + /// + /// The start and end of the string (when `index == self.len()`) + /// are considered to be boundaries. + /// + /// Fails if `index` is greater than `self.len()`. + /// + /// # Example + /// + /// ```rust + /// let s = "Löwe 老虎 Léopard"; + /// assert!(s.is_char_boundary(0)); + /// // start of `老` + /// assert!(s.is_char_boundary(6)); + /// assert!(s.is_char_boundary(s.len())); + /// + /// // second byte of `ö` + /// assert!(!s.is_char_boundary(2)); + /// + /// // third byte of `老` + /// assert!(!s.is_char_boundary(8)); + /// ``` + fn is_char_boundary(&self, index: uint) -> bool; + + /// Pluck a character out of a string and return the index of the next + /// character. + /// + /// This function can be used to iterate over the unicode characters of a + /// string. + /// + /// # Example + /// + /// This example manually iterate through the characters of a + /// string; this should normally by done by `.chars()` or + /// `.char_indices`. + /// + /// ```rust + /// use std::str::CharRange; + /// + /// let s = "中华Việt Nam"; + /// let mut i = 0u; + /// while i < s.len() { + /// let CharRange {ch, next} = s.char_range_at(i); + /// println!("{}: {}", i, ch); + /// i = next; + /// } + /// ``` + /// + /// ## Output + /// + /// ```ignore + /// 0: 中 + /// 3: 华 + /// 6: V + /// 7: i + /// 8: ệ + /// 11: t + /// 12: + /// 13: N + /// 14: a + /// 15: m + /// ``` + /// + /// # Arguments + /// + /// * s - The string + /// * i - The byte offset of the char to extract + /// + /// # Return value + /// + /// A record {ch: char, next: uint} containing the char value and the byte + /// index of the next unicode character. + /// + /// # Failure + /// + /// If `i` is greater than or equal to the length of the string. + /// If `i` is not the index of the beginning of a valid UTF-8 character. + fn char_range_at(&self, start: uint) -> CharRange; + + /// Given a byte position and a str, return the previous char and its position. + /// + /// This function can be used to iterate over a unicode string in reverse. + /// + /// Returns 0 for next index if called on start index 0. + fn char_range_at_reverse(&self, start: uint) -> CharRange; + + /// Plucks the character starting at the `i`th byte of a string + fn char_at(&self, i: uint) -> char; + + /// Plucks the character ending at the `i`th byte of a string + fn char_at_reverse(&self, i: uint) -> char; + + /// Work with the byte buffer of a string as a byte slice. + fn as_bytes(&self) -> &'a [u8]; + + /// Returns the byte index of the first character of `self` that + /// matches `search`. + /// + /// # Return value + /// + /// `Some` containing the byte index of the last matching character + /// or `None` if there is no match + /// + /// # Example + /// + /// ```rust + /// let s = "Löwe 老虎 Léopard"; + /// + /// assert_eq!(s.find('L'), Some(0)); + /// assert_eq!(s.find('é'), Some(14)); + /// + /// // the first space + /// assert_eq!(s.find(|c: char| c.is_whitespace()), Some(5)); + /// + /// // neither are found + /// assert_eq!(s.find(&['1', '2']), None); + /// ``` + fn find(&self, search: C) -> Option; + + /// Returns the byte index of the last character of `self` that + /// matches `search`. + /// + /// # Return value + /// + /// `Some` containing the byte index of the last matching character + /// or `None` if there is no match. + /// + /// # Example + /// + /// ```rust + /// let s = "Löwe 老虎 Léopard"; + /// + /// assert_eq!(s.rfind('L'), Some(13)); + /// assert_eq!(s.rfind('é'), Some(14)); + /// + /// // the second space + /// assert_eq!(s.rfind(|c: char| c.is_whitespace()), Some(12)); + /// + /// // searches for an occurrence of either `1` or `2`, but neither are found + /// assert_eq!(s.rfind(&['1', '2']), None); + /// ``` + fn rfind(&self, search: C) -> Option; + + /// Returns the byte index of the first matching substring + /// + /// # Arguments + /// + /// * `needle` - The string to search for + /// + /// # Return value + /// + /// `Some` containing the byte index of the first matching substring + /// or `None` if there is no match. + /// + /// # Example + /// + /// ```rust + /// let s = "Löwe 老虎 Léopard"; + /// + /// assert_eq!(s.find_str("老虎 L"), Some(6)); + /// assert_eq!(s.find_str("muffin man"), None); + /// ``` + fn find_str(&self, &str) -> Option; + + /// Retrieves the first character from a string slice and returns + /// it. This does not allocate a new string; instead, it returns a + /// slice that point one character beyond the character that was + /// shifted. If the string does not contain any characters, + /// a tuple of None and an empty string is returned instead. + /// + /// # Example + /// + /// ```rust + /// let s = "Löwe 老虎 Léopard"; + /// let (c, s1) = s.slice_shift_char(); + /// assert_eq!(c, Some('L')); + /// assert_eq!(s1, "öwe 老虎 Léopard"); + /// + /// let (c, s2) = s1.slice_shift_char(); + /// assert_eq!(c, Some('ö')); + /// assert_eq!(s2, "we 老虎 Léopard"); + /// ``` + fn slice_shift_char(&self) -> (Option, &'a str); + + /// Returns the byte offset of an inner slice relative to an enclosing outer slice. + /// + /// Fails if `inner` is not a direct slice contained within self. + /// + /// # Example + /// + /// ```rust + /// let string = "a\nb\nc"; + /// let lines: ~[&str] = string.lines().collect(); + /// + /// assert!(string.subslice_offset(lines[0]) == 0); // &"a" + /// assert!(string.subslice_offset(lines[1]) == 2); // &"b" + /// assert!(string.subslice_offset(lines[2]) == 4); // &"c" + /// ``` + fn subslice_offset(&self, inner: &str) -> uint; + + /// Return an unsafe pointer to the strings buffer. + /// + /// The caller must ensure that the string outlives this pointer, + /// and that it is not reallocated (e.g. by pushing to the + /// string). + fn as_ptr(&self) -> *u8; +} + +impl<'a> StrSlice<'a> for &'a str { + #[inline] + fn contains<'a>(&self, needle: &'a str) -> bool { + self.find_str(needle).is_some() + } + + #[inline] + fn contains_char(&self, needle: char) -> bool { + self.find(needle).is_some() + } + + #[inline] + fn chars(&self) -> Chars<'a> { + Chars{string: *self} + } + + #[inline] + #[deprecated = "replaced by .chars().rev()"] + fn chars_rev(&self) -> RevChars<'a> { + self.chars().rev() + } + + #[inline] + fn bytes(&self) -> Bytes<'a> { + self.as_bytes().iter().map(|&b| b) + } + + #[inline] + #[deprecated = "replaced by .bytes().rev()"] + fn bytes_rev(&self) -> RevBytes<'a> { + self.bytes().rev() + } + + #[inline] + fn char_indices(&self) -> CharOffsets<'a> { + CharOffsets{string: *self, iter: self.chars()} + } + + #[inline] + #[deprecated = "replaced by .char_indices().rev()"] + fn char_indices_rev(&self) -> RevCharOffsets<'a> { + self.char_indices().rev() + } + + #[inline] + fn split(&self, sep: Sep) -> CharSplits<'a, Sep> { + CharSplits { + string: *self, + only_ascii: sep.only_ascii(), + sep: sep, + allow_trailing_empty: true, + finished: false, + } + } + + #[inline] + fn splitn(&self, sep: Sep, count: uint) + -> CharSplitsN<'a, Sep> { + CharSplitsN { + iter: self.split(sep), + count: count, + invert: false, + } + } + + #[inline] + fn split_terminator(&self, sep: Sep) + -> CharSplits<'a, Sep> { + CharSplits { + allow_trailing_empty: false, + ..self.split(sep) + } + } + + #[inline] + #[deprecated = "replaced by .split(sep).rev()"] + fn rsplit(&self, sep: Sep) -> RevCharSplits<'a, Sep> { + self.split(sep).rev() + } + + #[inline] + fn rsplitn(&self, sep: Sep, count: uint) + -> CharSplitsN<'a, Sep> { + CharSplitsN { + iter: self.split(sep), + count: count, + invert: true, + } + } + + #[inline] + fn match_indices(&self, sep: &'a str) -> MatchIndices<'a> { + assert!(!sep.is_empty()) + MatchIndices { + haystack: *self, + needle: sep, + position: 0 + } + } + + #[inline] + fn split_str(&self, sep: &'a str) -> StrSplits<'a> { + StrSplits { + it: self.match_indices(sep), + last_end: 0, + finished: false + } + } + + #[inline] + fn lines(&self) -> CharSplits<'a, char> { + self.split_terminator('\n') + } + + fn lines_any(&self) -> AnyLines<'a> { + self.lines().map(|line| { + let l = line.len(); + if l > 0 && line[l - 1] == '\r' as u8 { line.slice(0, l - 1) } + else { line } + }) + } + + #[inline] + fn words(&self) -> Words<'a> { + self.split(char::is_whitespace).filter(|s| !s.is_empty()) + } + + #[inline] + fn is_whitespace(&self) -> bool { self.chars().all(char::is_whitespace) } + + #[inline] + fn is_alphanumeric(&self) -> bool { self.chars().all(char::is_alphanumeric) } + + #[inline] + fn char_len(&self) -> uint { self.chars().len() } + + #[inline] + fn slice(&self, begin: uint, end: uint) -> &'a str { + assert!(self.is_char_boundary(begin) && self.is_char_boundary(end)); + unsafe { raw::slice_bytes(*self, begin, end) } + } + + #[inline] + fn slice_from(&self, begin: uint) -> &'a str { + self.slice(begin, self.len()) + } + + #[inline] + fn slice_to(&self, end: uint) -> &'a str { + assert!(self.is_char_boundary(end)); + unsafe { raw::slice_bytes(*self, 0, end) } + } + + fn slice_chars(&self, begin: uint, end: uint) -> &'a str { + assert!(begin <= end); + let mut count = 0; + let mut begin_byte = None; + let mut end_byte = None; + + // This could be even more efficient by not decoding, + // only finding the char boundaries + for (idx, _) in self.char_indices() { + if count == begin { begin_byte = Some(idx); } + if count == end { end_byte = Some(idx); break; } + count += 1; + } + if begin_byte.is_none() && count == begin { begin_byte = Some(self.len()) } + if end_byte.is_none() && count == end { end_byte = Some(self.len()) } + + match (begin_byte, end_byte) { + (None, _) => fail!("slice_chars: `begin` is beyond end of string"), + (_, None) => fail!("slice_chars: `end` is beyond end of string"), + (Some(a), Some(b)) => unsafe { raw::slice_bytes(*self, a, b) } + } + } + + #[inline] + fn starts_with<'a>(&self, needle: &'a str) -> bool { + let n = needle.len(); + self.len() >= n && needle.as_bytes() == self.as_bytes().slice_to(n) + } + + #[inline] + fn ends_with(&self, needle: &str) -> bool { + let (m, n) = (self.len(), needle.len()); + m >= n && needle.as_bytes() == self.as_bytes().slice_from(m - n) + } + + #[inline] + fn trim(&self) -> &'a str { + self.trim_left().trim_right() + } + + #[inline] + fn trim_left(&self) -> &'a str { + self.trim_left_chars(char::is_whitespace) + } + + #[inline] + fn trim_right(&self) -> &'a str { + self.trim_right_chars(char::is_whitespace) + } + + #[inline] + fn trim_chars(&self, mut to_trim: C) -> &'a str { + let cur = match self.find(|c: char| !to_trim.matches(c)) { + None => "", + Some(i) => unsafe { raw::slice_bytes(*self, i, self.len()) } + }; + match cur.rfind(|c: char| !to_trim.matches(c)) { + None => "", + Some(i) => { + let right = cur.char_range_at(i).next; + unsafe { raw::slice_bytes(cur, 0, right) } + } + } + } + + #[inline] + fn trim_left_chars(&self, mut to_trim: C) -> &'a str { + match self.find(|c: char| !to_trim.matches(c)) { + None => "", + Some(first) => unsafe { raw::slice_bytes(*self, first, self.len()) } + } + } + + #[inline] + fn trim_right_chars(&self, mut to_trim: C) -> &'a str { + match self.rfind(|c: char| !to_trim.matches(c)) { + None => "", + Some(last) => { + let next = self.char_range_at(last).next; + unsafe { raw::slice_bytes(*self, 0u, next) } + } + } + } + + #[inline] + fn is_char_boundary(&self, index: uint) -> bool { + if index == self.len() { return true; } + let b = self[index]; + return b < 128u8 || b >= 192u8; + } + + #[inline] + fn char_range_at(&self, i: uint) -> CharRange { + if self[i] < 128u8 { + return CharRange {ch: self[i] as char, next: i + 1 }; + } + + // Multibyte case is a fn to allow char_range_at to inline cleanly + fn multibyte_char_range_at(s: &str, i: uint) -> CharRange { + let mut val = s[i] as u32; + let w = UTF8_CHAR_WIDTH[val as uint] as uint; + assert!((w != 0)); + + val = utf8_first_byte!(val, w); + val = utf8_acc_cont_byte!(val, s[i + 1]); + if w > 2 { val = utf8_acc_cont_byte!(val, s[i + 2]); } + if w > 3 { val = utf8_acc_cont_byte!(val, s[i + 3]); } + + return CharRange {ch: unsafe { transmute(val) }, next: i + w}; + } + + return multibyte_char_range_at(*self, i); + } + + #[inline] + fn char_range_at_reverse(&self, start: uint) -> CharRange { + let mut prev = start; + + prev = prev.saturating_sub(1); + if self[prev] < 128 { return CharRange{ch: self[prev] as char, next: prev} } + + // Multibyte case is a fn to allow char_range_at_reverse to inline cleanly + fn multibyte_char_range_at_reverse(s: &str, mut i: uint) -> CharRange { + // while there is a previous byte == 10...... + while i > 0 && s[i] & 192u8 == TAG_CONT_U8 { + i -= 1u; + } + + let mut val = s[i] as u32; + let w = UTF8_CHAR_WIDTH[val as uint] as uint; + assert!((w != 0)); + + val = utf8_first_byte!(val, w); + val = utf8_acc_cont_byte!(val, s[i + 1]); + if w > 2 { val = utf8_acc_cont_byte!(val, s[i + 2]); } + if w > 3 { val = utf8_acc_cont_byte!(val, s[i + 3]); } + + return CharRange {ch: unsafe { transmute(val) }, next: i}; + } + + return multibyte_char_range_at_reverse(*self, prev); + } + + #[inline] + fn char_at(&self, i: uint) -> char { + self.char_range_at(i).ch + } + + #[inline] + fn char_at_reverse(&self, i: uint) -> char { + self.char_range_at_reverse(i).ch + } + + #[inline] + fn as_bytes(&self) -> &'a [u8] { + unsafe { cast::transmute(*self) } + } + + fn find(&self, mut search: C) -> Option { + if search.only_ascii() { + self.bytes().position(|b| search.matches(b as char)) + } else { + for (index, c) in self.char_indices() { + if search.matches(c) { return Some(index); } + } + None + } + } + + fn rfind(&self, mut search: C) -> Option { + if search.only_ascii() { + self.bytes().rposition(|b| search.matches(b as char)) + } else { + for (index, c) in self.char_indices().rev() { + if search.matches(c) { return Some(index); } + } + None + } + } + + fn find_str(&self, needle: &str) -> Option { + if needle.is_empty() { + Some(0) + } else { + self.match_indices(needle) + .next() + .map(|(start, _end)| start) + } + } + + #[inline] + fn slice_shift_char(&self) -> (Option, &'a str) { + if self.is_empty() { + return (None, *self); + } else { + let CharRange {ch, next} = self.char_range_at(0u); + let next_s = unsafe { raw::slice_bytes(*self, next, self.len()) }; + return (Some(ch), next_s); + } + } + + fn subslice_offset(&self, inner: &str) -> uint { + let a_start = self.as_ptr() as uint; + let a_end = a_start + self.len(); + let b_start = inner.as_ptr() as uint; + let b_end = b_start + inner.len(); + + assert!(a_start <= b_start); + assert!(b_end <= a_end); + b_start - a_start + } + + #[inline] + fn as_ptr(&self) -> *u8 { + self.repr().data + } +} + +impl<'a> Default for &'a str { + fn default() -> &'a str { "" } +} diff --git a/src/libstd/fmt/mod.rs b/src/libstd/fmt/mod.rs index c80177b4644..9f683e45678 100644 --- a/src/libstd/fmt/mod.rs +++ b/src/libstd/fmt/mod.rs @@ -493,11 +493,11 @@ use io; use iter; use iter::{Iterator, range}; use num::Signed; -use option::{Option,Some,None}; +use option::{Option, Some, None}; use owned::Box; use repr; -use result::{Ok, Err}; -use str::StrSlice; +use result::{Ok, Err, ResultUnwrap}; +use str::{StrSlice, StrAllocating, UTF16Item, ScalarValue, LoneSurrogate}; use str; use slice::{Vector, ImmutableVector}; use slice; @@ -1359,5 +1359,20 @@ impl Show for cmp::Ordering { } } +impl Show for Cell { + fn fmt(&self, f: &mut Formatter) -> Result { + write!(f.buf, r"Cell \{ value: {} \}", self.get()) + } +} + +impl Show for UTF16Item { + fn fmt(&self, f: &mut Formatter) -> Result { + match *self { + ScalarValue(c) => write!(f.buf, "ScalarValue({})", c), + LoneSurrogate(u) => write!(f.buf, "LoneSurrogate({})", u), + } + } +} + // If you expected tests to be here, look instead at the run-pass/ifmt.rs test, // it's a lot easier than creating all of the rt::Piece structures here. diff --git a/src/libstd/fmt/num.rs b/src/libstd/fmt/num.rs index 2032a2a6b58..12adcee2f0f 100644 --- a/src/libstd/fmt/num.rs +++ b/src/libstd/fmt/num.rs @@ -194,7 +194,7 @@ mod tests { use fmt::radix; use super::{Binary, Octal, Decimal, LowerHex, UpperHex}; use super::{GenericRadix, Radix}; - use str::StrSlice; + use str::StrAllocating; #[test] fn test_radix_base() { diff --git a/src/libstd/hash/sip.rs b/src/libstd/hash/sip.rs index 3c1d5897e38..58e0f4c717d 100644 --- a/src/libstd/hash/sip.rs +++ b/src/libstd/hash/sip.rs @@ -362,7 +362,7 @@ mod tests { use prelude::*; use num::ToStrRadix; use option::{Some, None}; - use str::{Str,StrSlice}; + use str::Str; use strbuf::StrBuf; use slice::{Vector, ImmutableVector}; use self::test::Bencher; diff --git a/src/libstd/io/mod.rs b/src/libstd/io/mod.rs index 59a8c6f3439..cd069ddc1ea 100644 --- a/src/libstd/io/mod.rs +++ b/src/libstd/io/mod.rs @@ -230,7 +230,7 @@ use option::{Option, Some, None}; use owned::Box; use path::Path; use result::{Ok, Err, Result}; -use str::StrSlice; +use str::{StrSlice, StrAllocating}; use str; use uint; use unstable::finally::try_finally; diff --git a/src/libstd/io/process.rs b/src/libstd/io/process.rs index 1471a049bf3..74f6944f102 100644 --- a/src/libstd/io/process.rs +++ b/src/libstd/io/process.rs @@ -428,7 +428,6 @@ impl Drop for Process { mod tests { use io::process::{ProcessConfig, Process}; use prelude::*; - use str::StrSlice; // FIXME(#10380) these tests should not all be ignored on android. diff --git a/src/libstd/local_data.rs b/src/libstd/local_data.rs index 9a029de8d6c..da738a387b2 100644 --- a/src/libstd/local_data.rs +++ b/src/libstd/local_data.rs @@ -357,7 +357,6 @@ mod tests { use super::*; use owned::Box; use task; - use str::StrSlice; #[test] fn test_tls_multitask() { diff --git a/src/libstd/os.rs b/src/libstd/os.rs index 071aae974db..47d316f75ab 100644 --- a/src/libstd/os.rs +++ b/src/libstd/os.rs @@ -38,7 +38,7 @@ use ops::Drop; use result::{Err, Ok, Result}; use ptr; use str; -use str::{Str, StrSlice}; +use str::{Str, StrSlice, StrAllocating}; use fmt; use sync::atomics::{AtomicInt, INIT_ATOMIC_INT, SeqCst}; use path::{Path, GenericPath}; diff --git a/src/libstd/path/windows.rs b/src/libstd/path/windows.rs index 758a76167cd..f21fbe1b6e6 100644 --- a/src/libstd/path/windows.rs +++ b/src/libstd/path/windows.rs @@ -21,7 +21,7 @@ use io::Writer; use iter::{AdditiveIterator, DoubleEndedIterator, Extendable, Rev, Iterator, Map}; use option::{Option, Some, None}; use slice::{Vector, OwnedVector, ImmutableVector}; -use str::{CharSplits, Str, StrVector, StrSlice}; +use str::{CharSplits, Str, StrAllocating, StrVector, StrSlice}; use strbuf::StrBuf; use vec::Vec; @@ -684,7 +684,7 @@ impl Path { } } - fn normalize_(s: S) -> (Option, StrBuf) { + fn normalize_(s: S) -> (Option, StrBuf) { // make borrowck happy let (prefix, val) = { let prefix = parse_prefix(s.as_slice()); @@ -842,7 +842,7 @@ impl Path { } fn update_normalized(&mut self, s: S) { - let (prefix, path) = Path::normalize_(s); + let (prefix, path) = Path::normalize_(s.as_slice()); self.repr = path; self.prefix = prefix; self.update_sepidx(); diff --git a/src/libstd/prelude.rs b/src/libstd/prelude.rs index d4440e4a7da..a8e26845c1b 100644 --- a/src/libstd/prelude.rs +++ b/src/libstd/prelude.rs @@ -70,6 +70,7 @@ pub use path::{GenericPath, Path, PosixPath, WindowsPath}; pub use ptr::RawPtr; pub use io::{Buffer, Writer, Reader, Seek}; pub use str::{Str, StrVector, StrSlice, OwnedStr, IntoMaybeOwned}; +pub use str::{StrAllocating}; pub use to_str::{ToStr, IntoStr}; pub use tuple::{Tuple1, Tuple2, Tuple3, Tuple4}; pub use tuple::{Tuple5, Tuple6, Tuple7, Tuple8}; diff --git a/src/libstd/repr.rs b/src/libstd/repr.rs index 79b927c8d77..380951772ae 100644 --- a/src/libstd/repr.rs +++ b/src/libstd/repr.rs @@ -606,6 +606,7 @@ pub fn write_repr(writer: &mut io::Writer, object: &T) -> io::IoResult<()> { pub fn repr_to_str(t: &T) -> ~str { use str; + use str::StrAllocating; use io; let mut result = io::MemWriter::new(); diff --git a/src/libstd/rt/args.rs b/src/libstd/rt/args.rs index 17e6f6b7698..ac1692e6bb3 100644 --- a/src/libstd/rt/args.rs +++ b/src/libstd/rt/args.rs @@ -70,7 +70,6 @@ mod imp { use owned::Box; use unstable::mutex::{StaticNativeMutex, NATIVE_MUTEX_INIT}; use mem; - #[cfg(not(test))] use str::StrSlice; #[cfg(not(test))] use ptr::RawPtr; static mut global_args_ptr: uint = 0; diff --git a/src/libstd/str.rs b/src/libstd/str.rs index 54ddf60ed7d..666c0a58b33 100644 --- a/src/libstd/str.rs +++ b/src/libstd/str.rs @@ -83,22 +83,24 @@ use cmp::{Eq, TotalEq, Ord, TotalOrd, Equiv, Ordering}; use container::Container; use fmt; use io::Writer; -use iter::{Iterator, FromIterator, Extendable, range}; -use iter::{Filter, AdditiveIterator, Map}; -use iter::{Rev, DoubleEndedIterator, ExactSize}; -use libc; -use num::Saturating; +use iter::{Iterator, range, AdditiveIterator}; use option::{None, Option, Some}; use ptr; use from_str::FromStr; -use slice; use slice::{OwnedVector, ImmutableVector, MutableVector}; use slice::{Vector}; use vec::Vec; use default::Default; -use raw::Repr; use strbuf::StrBuf; +pub use core::str::{from_utf8, CharEq, Chars, CharOffsets, RevChars}; +pub use core::str::{RevCharOffsets, Bytes, RevBytes, CharSplits, RevCharSplits}; +pub use core::str::{CharSplitsN, Words, AnyLines, MatchIndices, StrSplits}; +pub use core::str::{eq_slice, eq, is_utf8, is_utf16, UTF16Items}; +pub use core::str::{UTF16Item, ScalarValue, LoneSurrogate, utf16_items}; +pub use core::str::{truncate_utf16_at_nul, utf8_char_width, CharRange}; +pub use core::str::{Str, StrSlice}; + /* Section: Creating a string */ @@ -113,18 +115,6 @@ pub fn from_utf8_owned(vv: ~[u8]) -> Option<~str> { } } -/// Converts a vector to a string slice without performing any allocations. -/// -/// Once the slice has been validated as utf-8, it is transmuted in-place and -/// returned as a '&str' instead of a '&[u8]' -/// -/// Returns None if the slice is not utf-8. -pub fn from_utf8<'a>(v: &'a [u8]) -> Option<&'a str> { - if is_utf8(v) { - Some(unsafe { raw::from_utf8(v) }) - } else { None } -} - impl FromStr for ~str { #[inline] fn from_str(s: &str) -> Option<~str> { Some(s.to_owned()) } @@ -214,348 +204,10 @@ impl<'a, S: Str> StrVector for Vec { } } -/// Something that can be used to compare against a character -pub trait CharEq { - /// Determine if the splitter should split at the given character - fn matches(&mut self, char) -> bool; - /// Indicate if this is only concerned about ASCII characters, - /// which can allow for a faster implementation. - fn only_ascii(&self) -> bool; -} - -impl CharEq for char { - #[inline] - fn matches(&mut self, c: char) -> bool { *self == c } - - #[inline] - fn only_ascii(&self) -> bool { (*self as uint) < 128 } -} - -impl<'a> CharEq for |char|: 'a -> bool { - #[inline] - fn matches(&mut self, c: char) -> bool { (*self)(c) } - - #[inline] - fn only_ascii(&self) -> bool { false } -} - -impl CharEq for extern "Rust" fn(char) -> bool { - #[inline] - fn matches(&mut self, c: char) -> bool { (*self)(c) } - - #[inline] - fn only_ascii(&self) -> bool { false } -} - -impl<'a> CharEq for &'a [char] { - #[inline] - fn matches(&mut self, c: char) -> bool { - self.iter().any(|&mut m| m.matches(c)) - } - - #[inline] - fn only_ascii(&self) -> bool { - self.iter().all(|m| m.only_ascii()) - } -} - /* Section: Iterators */ -/// External iterator for a string's characters. -/// Use with the `std::iter` module. -#[deriving(Clone)] -pub struct Chars<'a> { - /// The slice remaining to be iterated - string: &'a str, -} - -impl<'a> Iterator for Chars<'a> { - #[inline] - fn next(&mut self) -> Option { - // Decode the next codepoint, then update - // the slice to be just the remaining part - if self.string.len() != 0 { - let CharRange {ch, next} = self.string.char_range_at(0); - unsafe { - self.string = raw::slice_unchecked(self.string, next, self.string.len()); - } - Some(ch) - } else { - None - } - } - - #[inline] - fn size_hint(&self) -> (uint, Option) { - (self.string.len().saturating_add(3)/4, Some(self.string.len())) - } -} - -impl<'a> DoubleEndedIterator for Chars<'a> { - #[inline] - fn next_back(&mut self) -> Option { - if self.string.len() != 0 { - let CharRange {ch, next} = self.string.char_range_at_reverse(self.string.len()); - unsafe { - self.string = raw::slice_unchecked(self.string, 0, next); - } - Some(ch) - } else { - None - } - } -} - -/// External iterator for a string's characters and their byte offsets. -/// Use with the `std::iter` module. -#[deriving(Clone)] -pub struct CharOffsets<'a> { - /// The original string to be iterated - string: &'a str, - iter: Chars<'a>, -} - -impl<'a> Iterator<(uint, char)> for CharOffsets<'a> { - #[inline] - fn next(&mut self) -> Option<(uint, char)> { - // Compute the byte offset by using the pointer offset between - // the original string slice and the iterator's remaining part - let offset = self.iter.string.as_ptr() as uint - self.string.as_ptr() as uint; - self.iter.next().map(|ch| (offset, ch)) - } - - #[inline] - fn size_hint(&self) -> (uint, Option) { - self.iter.size_hint() - } -} - -impl<'a> DoubleEndedIterator<(uint, char)> for CharOffsets<'a> { - #[inline] - fn next_back(&mut self) -> Option<(uint, char)> { - self.iter.next_back().map(|ch| { - let offset = self.iter.string.len() + - self.iter.string.as_ptr() as uint - self.string.as_ptr() as uint; - (offset, ch) - }) - } -} - -#[deprecated = "replaced by Rev>"] -pub type RevChars<'a> = Rev>; - -#[deprecated = "replaced by Rev>"] -pub type RevCharOffsets<'a> = Rev>; - -/// External iterator for a string's bytes. -/// Use with the `std::iter` module. -pub type Bytes<'a> = - Map<'a, &'a u8, u8, slice::Items<'a, u8>>; - -#[deprecated = "replaced by Rev>"] -pub type RevBytes<'a> = Rev>; - -/// An iterator over the substrings of a string, separated by `sep`. -#[deriving(Clone)] -pub struct CharSplits<'a, Sep> { - /// The slice remaining to be iterated - string: &'a str, - sep: Sep, - /// Whether an empty string at the end is allowed - allow_trailing_empty: bool, - only_ascii: bool, - finished: bool, -} - -#[deprecated = "replaced by Rev>"] -pub type RevCharSplits<'a, Sep> = Rev>; - -/// An iterator over the substrings of a string, separated by `sep`, -/// splitting at most `count` times. -#[deriving(Clone)] -pub struct CharSplitsN<'a, Sep> { - iter: CharSplits<'a, Sep>, - /// The number of splits remaining - count: uint, - invert: bool, -} - -/// An iterator over the words of a string, separated by a sequence of whitespace -pub type Words<'a> = - Filter<'a, &'a str, CharSplits<'a, extern "Rust" fn(char) -> bool>>; - -/// An iterator over the lines of a string, separated by either `\n` or (`\r\n`). -pub type AnyLines<'a> = - Map<'a, &'a str, &'a str, CharSplits<'a, char>>; - -impl<'a, Sep> CharSplits<'a, Sep> { - #[inline] - fn get_end(&mut self) -> Option<&'a str> { - if !self.finished && (self.allow_trailing_empty || self.string.len() > 0) { - self.finished = true; - Some(self.string) - } else { - None - } - } -} - -impl<'a, Sep: CharEq> Iterator<&'a str> for CharSplits<'a, Sep> { - #[inline] - fn next(&mut self) -> Option<&'a str> { - if self.finished { return None } - - let mut next_split = None; - if self.only_ascii { - for (idx, byte) in self.string.bytes().enumerate() { - if self.sep.matches(byte as char) && byte < 128u8 { - next_split = Some((idx, idx + 1)); - break; - } - } - } else { - for (idx, ch) in self.string.char_indices() { - if self.sep.matches(ch) { - next_split = Some((idx, self.string.char_range_at(idx).next)); - break; - } - } - } - match next_split { - Some((a, b)) => unsafe { - let elt = raw::slice_unchecked(self.string, 0, a); - self.string = raw::slice_unchecked(self.string, b, self.string.len()); - Some(elt) - }, - None => self.get_end(), - } - } -} - -impl<'a, Sep: CharEq> DoubleEndedIterator<&'a str> -for CharSplits<'a, Sep> { - #[inline] - fn next_back(&mut self) -> Option<&'a str> { - if self.finished { return None } - - if !self.allow_trailing_empty { - self.allow_trailing_empty = true; - match self.next_back() { - Some(elt) if !elt.is_empty() => return Some(elt), - _ => if self.finished { return None } - } - } - let len = self.string.len(); - let mut next_split = None; - - if self.only_ascii { - for (idx, byte) in self.string.bytes().enumerate().rev() { - if self.sep.matches(byte as char) && byte < 128u8 { - next_split = Some((idx, idx + 1)); - break; - } - } - } else { - for (idx, ch) in self.string.char_indices().rev() { - if self.sep.matches(ch) { - next_split = Some((idx, self.string.char_range_at(idx).next)); - break; - } - } - } - match next_split { - Some((a, b)) => unsafe { - let elt = raw::slice_unchecked(self.string, b, len); - self.string = raw::slice_unchecked(self.string, 0, a); - Some(elt) - }, - None => { self.finished = true; Some(self.string) } - } - } -} - -impl<'a, Sep: CharEq> Iterator<&'a str> for CharSplitsN<'a, Sep> { - #[inline] - fn next(&mut self) -> Option<&'a str> { - if self.count != 0 { - self.count -= 1; - if self.invert { self.iter.next_back() } else { self.iter.next() } - } else { - self.iter.get_end() - } - } -} - -/// An iterator over the start and end indices of the matches of a -/// substring within a larger string -#[deriving(Clone)] -pub struct MatchIndices<'a> { - haystack: &'a str, - needle: &'a str, - position: uint, -} - -/// An iterator over the substrings of a string separated by a given -/// search string -#[deriving(Clone)] -pub struct StrSplits<'a> { - it: MatchIndices<'a>, - last_end: uint, - finished: bool -} - -impl<'a> Iterator<(uint, uint)> for MatchIndices<'a> { - #[inline] - fn next(&mut self) -> Option<(uint, uint)> { - // See Issue #1932 for why this is a naive search - let (h_len, n_len) = (self.haystack.len(), self.needle.len()); - let mut match_start = 0; - let mut match_i = 0; - - while self.position < h_len { - if self.haystack[self.position] == self.needle[match_i] { - if match_i == 0 { match_start = self.position; } - match_i += 1; - self.position += 1; - - if match_i == n_len { - // found a match! - return Some((match_start, self.position)); - } - } else { - // failed match, backtrack - if match_i > 0 { - match_i = 0; - self.position = match_start; - } - self.position += 1; - } - } - None - } -} - -impl<'a> Iterator<&'a str> for StrSplits<'a> { - #[inline] - fn next(&mut self) -> Option<&'a str> { - if self.finished { return None; } - - match self.it.next() { - Some((from, to)) => { - let ret = Some(self.it.haystack.slice(self.last_end, from)); - self.last_end = to; - ret - } - None => { - self.finished = true; - Some(self.it.haystack.slice(self.last_end, self.it.haystack.len())) - } - } - } -} - // Helper functions used for Unicode normalization fn canonical_sort(comb: &mut [(char, u8)]) { use iter::range; @@ -675,294 +327,10 @@ pub fn replace(s: &str, from: &str, to: &str) -> ~str { result.into_owned() } -/* -Section: Comparing strings -*/ - -// share the implementation of the lang-item vs. non-lang-item -// eq_slice. -#[inline] -fn eq_slice_(a: &str, b: &str) -> bool { - a.len() == b.len() && unsafe { - libc::memcmp(a.as_ptr() as *libc::c_void, - b.as_ptr() as *libc::c_void, - a.len() as libc::size_t) == 0 - } -} - -/// Bytewise slice equality -#[cfg(not(test))] -#[lang="str_eq"] -#[inline] -pub fn eq_slice(a: &str, b: &str) -> bool { - eq_slice_(a, b) -} - -/// Bytewise slice equality -#[cfg(test)] -#[inline] -pub fn eq_slice(a: &str, b: &str) -> bool { - eq_slice_(a, b) -} - -/// Bytewise string equality -#[cfg(not(test))] -#[lang="uniq_str_eq"] -#[inline] -pub fn eq(a: &~str, b: &~str) -> bool { - eq_slice(*a, *b) -} - -#[cfg(test)] -#[inline] -pub fn eq(a: &~str, b: &~str) -> bool { - eq_slice(*a, *b) -} - /* Section: Misc */ -/// Walk through `iter` checking that it's a valid UTF-8 sequence, -/// returning `true` in that case, or, if it is invalid, `false` with -/// `iter` reset such that it is pointing at the first byte in the -/// invalid sequence. -#[inline(always)] -fn run_utf8_validation_iterator(iter: &mut slice::Items) -> bool { - loop { - // save the current thing we're pointing at. - let old = *iter; - - // restore the iterator we had at the start of this codepoint. - macro_rules! err ( () => { {*iter = old; return false} }); - macro_rules! next ( () => { - match iter.next() { - Some(a) => *a, - // we needed data, but there was none: error! - None => err!() - } - }); - - let first = match iter.next() { - Some(&b) => b, - // we're at the end of the iterator and a codepoint - // boundary at the same time, so this string is valid. - None => return true - }; - - // ASCII characters are always valid, so only large - // bytes need more examination. - if first >= 128 { - let w = utf8_char_width(first); - let second = next!(); - // 2-byte encoding is for codepoints \u0080 to \u07ff - // first C2 80 last DF BF - // 3-byte encoding is for codepoints \u0800 to \uffff - // first E0 A0 80 last EF BF BF - // excluding surrogates codepoints \ud800 to \udfff - // ED A0 80 to ED BF BF - // 4-byte encoding is for codepoints \u10000 to \u10ffff - // first F0 90 80 80 last F4 8F BF BF - // - // Use the UTF-8 syntax from the RFC - // - // https://tools.ietf.org/html/rfc3629 - // UTF8-1 = %x00-7F - // UTF8-2 = %xC2-DF UTF8-tail - // UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) / - // %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail ) - // UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) / - // %xF4 %x80-8F 2( UTF8-tail ) - match w { - 2 => if second & 192 != TAG_CONT_U8 {err!()}, - 3 => { - match (first, second, next!() & 192) { - (0xE0 , 0xA0 .. 0xBF, TAG_CONT_U8) | - (0xE1 .. 0xEC, 0x80 .. 0xBF, TAG_CONT_U8) | - (0xED , 0x80 .. 0x9F, TAG_CONT_U8) | - (0xEE .. 0xEF, 0x80 .. 0xBF, TAG_CONT_U8) => {} - _ => err!() - } - } - 4 => { - match (first, second, next!() & 192, next!() & 192) { - (0xF0 , 0x90 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) | - (0xF1 .. 0xF3, 0x80 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) | - (0xF4 , 0x80 .. 0x8F, TAG_CONT_U8, TAG_CONT_U8) => {} - _ => err!() - } - } - _ => err!() - } - } - } -} - -/// Determines if a vector of bytes contains valid UTF-8. -pub fn is_utf8(v: &[u8]) -> bool { - run_utf8_validation_iterator(&mut v.iter()) -} - -#[inline(always)] -fn first_non_utf8_index(v: &[u8]) -> Option { - let mut it = v.iter(); - - let ok = run_utf8_validation_iterator(&mut it); - if ok { - None - } else { - // work out how many valid bytes we've consumed - // (run_utf8_validation_iterator resets the iterator to just - // after the last good byte), which we can do because the - // vector iterator size_hint is exact. - let (remaining, _) = it.size_hint(); - Some(v.len() - remaining) - } -} - -/// Determines if a vector of `u16` contains valid UTF-16 -pub fn is_utf16(v: &[u16]) -> bool { - let mut it = v.iter(); - macro_rules! next ( ($ret:expr) => { - match it.next() { Some(u) => *u, None => return $ret } - } - ) - loop { - let u = next!(true); - - match char::from_u32(u as u32) { - Some(_) => {} - None => { - let u2 = next!(false); - if u < 0xD7FF || u > 0xDBFF || - u2 < 0xDC00 || u2 > 0xDFFF { return false; } - } - } - } -} - -/// An iterator that decodes UTF-16 encoded codepoints from a vector -/// of `u16`s. -#[deriving(Clone)] -pub struct UTF16Items<'a> { - iter: slice::Items<'a, u16> -} -/// The possibilities for values decoded from a `u16` stream. -#[deriving(Eq, TotalEq, Clone, Show)] -pub enum UTF16Item { - /// A valid codepoint. - ScalarValue(char), - /// An invalid surrogate without its pair. - LoneSurrogate(u16) -} - -impl UTF16Item { - /// Convert `self` to a `char`, taking `LoneSurrogate`s to the - /// replacement character (U+FFFD). - #[inline] - pub fn to_char_lossy(&self) -> char { - match *self { - ScalarValue(c) => c, - LoneSurrogate(_) => '\uFFFD' - } - } -} - -impl<'a> Iterator for UTF16Items<'a> { - fn next(&mut self) -> Option { - let u = match self.iter.next() { - Some(u) => *u, - None => return None - }; - - if u < 0xD800 || 0xDFFF < u { - // not a surrogate - Some(ScalarValue(unsafe {cast::transmute(u as u32)})) - } else if u >= 0xDC00 { - // a trailing surrogate - Some(LoneSurrogate(u)) - } else { - // preserve state for rewinding. - let old = self.iter; - - let u2 = match self.iter.next() { - Some(u2) => *u2, - // eof - None => return Some(LoneSurrogate(u)) - }; - if u2 < 0xDC00 || u2 > 0xDFFF { - // not a trailing surrogate so we're not a valid - // surrogate pair, so rewind to redecode u2 next time. - self.iter = old; - return Some(LoneSurrogate(u)) - } - - // all ok, so lets decode it. - let c = ((u - 0xD800) as u32 << 10 | (u2 - 0xDC00) as u32) + 0x1_0000; - Some(ScalarValue(unsafe {cast::transmute(c)})) - } - } - - #[inline] - fn size_hint(&self) -> (uint, Option) { - let (low, high) = self.iter.size_hint(); - // we could be entirely valid surrogates (2 elements per - // char), or entirely non-surrogates (1 element per char) - (low / 2, high) - } -} - -/// Create an iterator over the UTF-16 encoded codepoints in `v`, -/// returning invalid surrogates as `LoneSurrogate`s. -/// -/// # Example -/// -/// ```rust -/// use std::str; -/// use std::str::{ScalarValue, LoneSurrogate}; -/// -/// // 𝄞music -/// let v = [0xD834, 0xDD1E, 0x006d, 0x0075, -/// 0x0073, 0xDD1E, 0x0069, 0x0063, -/// 0xD834]; -/// -/// assert_eq!(str::utf16_items(v).collect::<~[_]>(), -/// ~[ScalarValue('𝄞'), -/// ScalarValue('m'), ScalarValue('u'), ScalarValue('s'), -/// LoneSurrogate(0xDD1E), -/// ScalarValue('i'), ScalarValue('c'), -/// LoneSurrogate(0xD834)]); -/// ``` -pub fn utf16_items<'a>(v: &'a [u16]) -> UTF16Items<'a> { - UTF16Items { iter : v.iter() } -} - -/// Return a slice of `v` ending at (and not including) the first NUL -/// (0). -/// -/// # Example -/// -/// ```rust -/// use std::str; -/// -/// // "abcd" -/// let mut v = ['a' as u16, 'b' as u16, 'c' as u16, 'd' as u16]; -/// // no NULs so no change -/// assert_eq!(str::truncate_utf16_at_nul(v), v.as_slice()); -/// -/// // "ab\0d" -/// v[2] = 0; -/// assert_eq!(str::truncate_utf16_at_nul(v), -/// &['a' as u16, 'b' as u16]); -/// ``` -pub fn truncate_utf16_at_nul<'a>(v: &'a [u16]) -> &'a [u16] { - match v.iter().position(|c| *c == 0) { - // don't include the 0 - Some(i) => v.slice_to(i), - None => v - } -} - /// Decode a UTF-16 encoded vector `v` into a string, returning `None` /// if `v` contains any invalid data. /// @@ -1010,42 +378,6 @@ pub fn from_utf16_lossy(v: &[u16]) -> ~str { utf16_items(v).map(|c| c.to_char_lossy()).collect() } -// https://tools.ietf.org/html/rfc3629 -static UTF8_CHAR_WIDTH: [u8, ..256] = [ -1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, -1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F -1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, -1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F -1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, -1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F -1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, -1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF -0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2, -2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF -3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF -4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF -]; - -/// Given a first byte, determine how many bytes are in this UTF-8 character -#[inline] -pub fn utf8_char_width(b: u8) -> uint { - return UTF8_CHAR_WIDTH[b as uint] as uint; -} - -/// Struct that contains a `char` and the index of the first byte of -/// the next `char` in a string. This can be used as a data structure -/// for iterating over the UTF-8 bytes of a string. -pub struct CharRange { - /// Current `char` - pub ch: char, - /// Index of the first byte of the next `char` - pub next: uint, -} - // Return the initial codepoint accumulator for the first byte. // The first byte is special, only want bottom 5 bits for width 2, 4 bits // for width 3, and 3 bits for width 4 @@ -1071,13 +403,12 @@ static TAG_CONT_U8: u8 = 128u8; /// assert_eq!(output.as_slice(), "Hello \uFFFDWorld"); /// ``` pub fn from_utf8_lossy<'a>(v: &'a [u8]) -> MaybeOwned<'a> { - let firstbad = match first_non_utf8_index(v) { - None => return Slice(unsafe { cast::transmute(v) }), - Some(i) => i - }; + if is_utf8(v) { + return Slice(unsafe { cast::transmute(v) }) + } static REPLACEMENT: &'static [u8] = bytes!(0xEF, 0xBF, 0xBD); // U+FFFD in UTF-8 - let mut i = firstbad; + let mut i = 0; let total = v.len(); fn unsafe_get(xs: &[u8], i: uint) -> u8 { unsafe { *xs.unsafe_ref(i) } @@ -1101,7 +432,7 @@ pub fn from_utf8_lossy<'a>(v: &'a [u8]) -> MaybeOwned<'a> { // subseqidx is the index of the first byte of the subsequence we're looking at. // It's used to copy a bunch of contiguous good codepoints at once instead of copying // them one by one. - let mut subseqidx = firstbad; + let mut subseqidx = 0; while i < total { let i_ = i; @@ -1282,7 +613,9 @@ impl<'a> Str for MaybeOwned<'a> { Owned(ref s) => s.as_slice() } } +} +impl<'a> StrAllocating for MaybeOwned<'a> { #[inline] fn into_owned(self) -> ~str { match self { @@ -1335,16 +668,17 @@ impl<'a> fmt::Show for MaybeOwned<'a> { /// Unsafe operations pub mod raw { use cast; - use container::Container; use iter::Iterator; use libc; use ptr::RawPtr; use ptr; - use raw::Slice; - use slice::{MutableVector, ImmutableVector, OwnedVector, Vector}; - use str::{is_utf8, StrSlice}; + use slice::{MutableVector, OwnedVector, Vector}; + use str::{is_utf8}; use vec::Vec; + pub use core::str::raw::{from_utf8, c_str_to_static_slice, slice_bytes}; + pub use core::str::raw::{slice_unchecked}; + /// Create a Rust string from a *u8 buffer of the given length pub unsafe fn from_buf_len(buf: *u8, len: uint) -> ~str { let mut v = Vec::with_capacity(len); @@ -1373,12 +707,6 @@ pub mod raw { from_buf_len(buf as *u8, i as uint) } - /// Converts a slice of bytes to a string slice without checking - /// that the string contains valid UTF-8. - pub unsafe fn from_utf8<'a>(v: &'a [u8]) -> &'a str { - cast::transmute(v) - } - /// Converts an owned vector of bytes to a new owned string. This assumes /// that the utf-8-ness of the vector has already been validated #[inline] @@ -1389,50 +717,6 @@ pub mod raw { /// Converts a byte to a string. pub unsafe fn from_byte(u: u8) -> ~str { from_utf8_owned(box [u]) } - /// Form a slice from a C string. Unsafe because the caller must ensure the - /// C string has the static lifetime, or else the return value may be - /// invalidated later. - pub unsafe fn c_str_to_static_slice(s: *libc::c_char) -> &'static str { - let s = s as *u8; - let mut curr = s; - let mut len = 0u; - while *curr != 0u8 { - len += 1u; - curr = s.offset(len as int); - } - let v = Slice { data: s, len: len }; - assert!(is_utf8(::cast::transmute(v))); - ::cast::transmute(v) - } - - /// Takes a bytewise (not UTF-8) slice from a string. - /// - /// Returns the substring from [`begin`..`end`). - /// - /// # Failure - /// - /// If begin is greater than end. - /// If end is greater than the length of the string. - #[inline] - pub unsafe fn slice_bytes<'a>(s: &'a str, begin: uint, end: uint) -> &'a str { - assert!(begin <= end); - assert!(end <= s.len()); - slice_unchecked(s, begin, end) - } - - /// Takes a bytewise (not UTF-8) slice from a string. - /// - /// Returns the substring from [`begin`..`end`). - /// - /// Caller must check slice boundaries! - #[inline] - pub unsafe fn slice_unchecked<'a>(s: &'a str, begin: uint, end: uint) -> &'a str { - cast::transmute(Slice { - data: s.as_ptr().offset(begin as int), - len: end - begin, - }) - } - /// Access the str in its vector representation. /// The caller must preserve the valid UTF-8 property when modifying. #[inline] @@ -1447,8 +731,11 @@ pub mod raw { /// the string is actually the specified size. #[test] fn test_from_buf_len() { + use slice::ImmutableVector; + use str::StrAllocating; + unsafe { - let a = box [65u8, 65u8, 65u8, 65u8, 65u8, 65u8, 65u8, 0u8]; + let a = ~[65u8, 65u8, 65u8, 65u8, 65u8, 65u8, 65u8, 0u8]; let b = a.as_ptr(); let c = from_buf_len(b, 3u); assert_eq!(c, "AAA".to_owned()); @@ -1460,95 +747,8 @@ pub mod raw { Section: Trait implementations */ -#[cfg(not(test))] -#[allow(missing_doc)] -pub mod traits { - use container::Container; - use cmp::{TotalOrd, Ordering, Less, Equal, Greater, Eq, Ord, Equiv, TotalEq}; - use iter::Iterator; - use ops::Add; - use option::{Some, None}; - use str::{Str, StrSlice, eq_slice}; - use strbuf::StrBuf; - - impl<'a> Add<&'a str,~str> for &'a str { - #[inline] - fn add(&self, rhs: & &'a str) -> ~str { - let mut ret = StrBuf::from_owned_str(self.to_owned()); - ret.push_str(*rhs); - ret.into_owned() - } - } - - impl<'a> TotalOrd for &'a str { - #[inline] - fn cmp(&self, other: & &'a str) -> Ordering { - for (s_b, o_b) in self.bytes().zip(other.bytes()) { - match s_b.cmp(&o_b) { - Greater => return Greater, - Less => return Less, - Equal => () - } - } - - self.len().cmp(&other.len()) - } - } - - impl TotalOrd for ~str { - #[inline] - fn cmp(&self, other: &~str) -> Ordering { self.as_slice().cmp(&other.as_slice()) } - } - - impl<'a> Eq for &'a str { - #[inline] - fn eq(&self, other: & &'a str) -> bool { - eq_slice((*self), (*other)) - } - #[inline] - fn ne(&self, other: & &'a str) -> bool { !(*self).eq(other) } - } - - impl Eq for ~str { - #[inline] - fn eq(&self, other: &~str) -> bool { - eq_slice((*self), (*other)) - } - } - - impl<'a> TotalEq for &'a str {} - - impl TotalEq for ~str {} - - impl<'a> Ord for &'a str { - #[inline] - fn lt(&self, other: & &'a str) -> bool { self.cmp(other) == Less } - } - - impl Ord for ~str { - #[inline] - fn lt(&self, other: &~str) -> bool { self.cmp(other) == Less } - } - - impl<'a, S: Str> Equiv for &'a str { - #[inline] - fn equiv(&self, other: &S) -> bool { eq_slice(*self, other.as_slice()) } - } - - impl<'a, S: Str> Equiv for ~str { - #[inline] - fn equiv(&self, other: &S) -> bool { eq_slice(*self, other.as_slice()) } - } -} - -#[cfg(test)] -pub mod traits {} - /// Any string that can be represented as a slice -pub trait Str { - /// Work with `self` as a slice. - fn as_slice<'a>(&'a self) -> &'a str; - +pub trait StrAllocating: Str { /// Convert `self` into a ~str, not making a copy if possible. fn into_owned(self) -> ~str; @@ -1563,453 +763,26 @@ pub trait Str { fn into_strbuf(self) -> StrBuf { StrBuf::from_owned_str(self.into_owned()) } -} - -impl<'a> Str for &'a str { - #[inline] - fn as_slice<'a>(&'a self) -> &'a str { *self } - - #[inline] - fn into_owned(self) -> ~str { self.to_owned() } -} - -impl<'a> Str for ~str { - #[inline] - fn as_slice<'a>(&'a self) -> &'a str { - let s: &'a str = *self; s - } - - #[inline] - fn into_owned(self) -> ~str { self } -} - -impl<'a> Container for &'a str { - #[inline] - fn len(&self) -> uint { - self.repr().len - } -} - -impl Container for ~str { - #[inline] - fn len(&self) -> uint { self.as_slice().len() } -} - -/// Methods for string slices -pub trait StrSlice<'a> { - /// Returns true if one string contains another - /// - /// # Arguments - /// - /// - needle - The string to look for - fn contains<'a>(&self, needle: &'a str) -> bool; - - /// Returns true if a string contains a char. - /// - /// # Arguments - /// - /// - needle - The char to look for - fn contains_char(&self, needle: char) -> bool; - - /// An iterator over the characters of `self`. Note, this iterates - /// over unicode code-points, not unicode graphemes. - /// - /// # Example - /// - /// ```rust - /// let v: ~[char] = "abc åäö".chars().collect(); - /// assert_eq!(v, ~['a', 'b', 'c', ' ', 'å', 'ä', 'ö']); - /// ``` - fn chars(&self) -> Chars<'a>; - - /// Do not use this - it is deprecated. - #[deprecated = "replaced by .chars().rev()"] - fn chars_rev(&self) -> Rev>; - - /// An iterator over the bytes of `self` - fn bytes(&self) -> Bytes<'a>; - - /// Do not use this - it is deprecated. - #[deprecated = "replaced by .bytes().rev()"] - fn bytes_rev(&self) -> Rev>; - - /// An iterator over the characters of `self` and their byte offsets. - fn char_indices(&self) -> CharOffsets<'a>; - - /// Do not use this - it is deprecated. - #[deprecated = "replaced by .char_indices().rev()"] - fn char_indices_rev(&self) -> Rev>; - - /// An iterator over substrings of `self`, separated by characters - /// matched by `sep`. - /// - /// # Example - /// - /// ```rust - /// let v: ~[&str] = "Mary had a little lamb".split(' ').collect(); - /// assert_eq!(v, ~["Mary", "had", "a", "little", "lamb"]); - /// - /// let v: ~[&str] = "abc1def2ghi".split(|c: char| c.is_digit()).collect(); - /// assert_eq!(v, ~["abc", "def", "ghi"]); - /// - /// let v: ~[&str] = "lionXXtigerXleopard".split('X').collect(); - /// assert_eq!(v, ~["lion", "", "tiger", "leopard"]); - /// ``` - fn split(&self, sep: Sep) -> CharSplits<'a, Sep>; - - /// An iterator over substrings of `self`, separated by characters - /// matched by `sep`, restricted to splitting at most `count` - /// times. - /// - /// # Example - /// - /// ```rust - /// let v: ~[&str] = "Mary had a little lambda".splitn(' ', 2).collect(); - /// assert_eq!(v, ~["Mary", "had", "a little lambda"]); - /// - /// let v: ~[&str] = "abc1def2ghi".splitn(|c: char| c.is_digit(), 1).collect(); - /// assert_eq!(v, ~["abc", "def2ghi"]); - /// - /// let v: ~[&str] = "lionXXtigerXleopard".splitn('X', 2).collect(); - /// assert_eq!(v, ~["lion", "", "tigerXleopard"]); - /// ``` - fn splitn(&self, sep: Sep, count: uint) -> CharSplitsN<'a, Sep>; - - /// An iterator over substrings of `self`, separated by characters - /// matched by `sep`. - /// - /// Equivalent to `split`, except that the trailing substring - /// is skipped if empty (terminator semantics). - /// - /// # Example - /// - /// ```rust - /// let v: ~[&str] = "A.B.".split_terminator('.').collect(); - /// assert_eq!(v, ~["A", "B"]); - /// - /// let v: ~[&str] = "A..B..".split_terminator('.').collect(); - /// assert_eq!(v, ~["A", "", "B", ""]); - /// - /// let v: ~[&str] = "Mary had a little lamb".split(' ').rev().collect(); - /// assert_eq!(v, ~["lamb", "little", "a", "had", "Mary"]); - /// - /// let v: ~[&str] = "abc1def2ghi".split(|c: char| c.is_digit()).rev().collect(); - /// assert_eq!(v, ~["ghi", "def", "abc"]); - /// - /// let v: ~[&str] = "lionXXtigerXleopard".split('X').rev().collect(); - /// assert_eq!(v, ~["leopard", "tiger", "", "lion"]); - /// ``` - fn split_terminator(&self, sep: Sep) -> CharSplits<'a, Sep>; - - /// Do not use this - it is deprecated. - #[deprecated = "replaced by .split(sep).rev()"] - fn rsplit(&self, sep: Sep) -> Rev>; - - /// An iterator over substrings of `self`, separated by characters - /// matched by `sep`, starting from the end of the string. - /// Restricted to splitting at most `count` times. - /// - /// # Example - /// - /// ```rust - /// let v: ~[&str] = "Mary had a little lamb".rsplitn(' ', 2).collect(); - /// assert_eq!(v, ~["lamb", "little", "Mary had a"]); - /// - /// let v: ~[&str] = "abc1def2ghi".rsplitn(|c: char| c.is_digit(), 1).collect(); - /// assert_eq!(v, ~["ghi", "abc1def"]); - /// - /// let v: ~[&str] = "lionXXtigerXleopard".rsplitn('X', 2).collect(); - /// assert_eq!(v, ~["leopard", "tiger", "lionX"]); - /// ``` - fn rsplitn(&self, sep: Sep, count: uint) -> CharSplitsN<'a, Sep>; - - /// An iterator over the start and end indices of the disjoint - /// matches of `sep` within `self`. - /// - /// That is, each returned value `(start, end)` satisfies - /// `self.slice(start, end) == sep`. For matches of `sep` within - /// `self` that overlap, only the indicies corresponding to the - /// first match are returned. - /// - /// # Example - /// - /// ```rust - /// let v: ~[(uint, uint)] = "abcXXXabcYYYabc".match_indices("abc").collect(); - /// assert_eq!(v, ~[(0,3), (6,9), (12,15)]); - /// - /// let v: ~[(uint, uint)] = "1abcabc2".match_indices("abc").collect(); - /// assert_eq!(v, ~[(1,4), (4,7)]); - /// - /// let v: ~[(uint, uint)] = "ababa".match_indices("aba").collect(); - /// assert_eq!(v, ~[(0, 3)]); // only the first `aba` - /// ``` - fn match_indices(&self, sep: &'a str) -> MatchIndices<'a>; - - /// An iterator over the substrings of `self` separated by `sep`. - /// - /// # Example - /// - /// ```rust - /// let v: ~[&str] = "abcXXXabcYYYabc".split_str("abc").collect(); - /// assert_eq!(v, ~["", "XXX", "YYY", ""]); - /// - /// let v: ~[&str] = "1abcabc2".split_str("abc").collect(); - /// assert_eq!(v, ~["1", "", "2"]); - /// ``` - fn split_str(&self, &'a str) -> StrSplits<'a>; - - /// An iterator over the lines of a string (subsequences separated - /// by `\n`). This does not include the empty string after a - /// trailing `\n`. - /// - /// # Example - /// - /// ```rust - /// let four_lines = "foo\nbar\n\nbaz\n"; - /// let v: ~[&str] = four_lines.lines().collect(); - /// assert_eq!(v, ~["foo", "bar", "", "baz"]); - /// ``` - fn lines(&self) -> CharSplits<'a, char>; - - /// An iterator over the lines of a string, separated by either - /// `\n` or `\r\n`. As with `.lines()`, this does not include an - /// empty trailing line. - /// - /// # Example - /// - /// ```rust - /// let four_lines = "foo\r\nbar\n\r\nbaz\n"; - /// let v: ~[&str] = four_lines.lines_any().collect(); - /// assert_eq!(v, ~["foo", "bar", "", "baz"]); - /// ``` - fn lines_any(&self) -> AnyLines<'a>; - - /// An iterator over the words of a string (subsequences separated - /// by any sequence of whitespace). Sequences of whitespace are - /// collapsed, so empty "words" are not included. - /// - /// # Example - /// - /// ```rust - /// let some_words = " Mary had\ta little \n\t lamb"; - /// let v: ~[&str] = some_words.words().collect(); - /// assert_eq!(v, ~["Mary", "had", "a", "little", "lamb"]); - /// ``` - fn words(&self) -> Words<'a>; - - /// An Iterator over the string in Unicode Normalization Form D - /// (canonical decomposition). - fn nfd_chars(&self) -> Normalizations<'a>; - - /// An Iterator over the string in Unicode Normalization Form KD - /// (compatibility decomposition). - fn nfkd_chars(&self) -> Normalizations<'a>; - - /// Returns true if the string contains only whitespace. - /// - /// Whitespace characters are determined by `char::is_whitespace`. - /// - /// # Example - /// - /// ```rust - /// assert!(" \t\n".is_whitespace()); - /// assert!("".is_whitespace()); - /// - /// assert!( !"abc".is_whitespace()); - /// ``` - fn is_whitespace(&self) -> bool; - - /// Returns true if the string contains only alphanumeric code - /// points. - /// - /// Alphanumeric characters are determined by `char::is_alphanumeric`. - /// - /// # Example - /// - /// ```rust - /// assert!("Löwe老虎Léopard123".is_alphanumeric()); - /// assert!("".is_alphanumeric()); - /// - /// assert!( !" &*~".is_alphanumeric()); - /// ``` - fn is_alphanumeric(&self) -> bool; - - /// Returns the number of Unicode code points (`char`) that a - /// string holds. - /// - /// This does not perform any normalization, and is `O(n)`, since - /// UTF-8 is a variable width encoding of code points. - /// - /// *Warning*: The number of code points in a string does not directly - /// correspond to the number of visible characters or width of the - /// visible text due to composing characters, and double- and - /// zero-width ones. - /// - /// See also `.len()` for the byte length. - /// - /// # Example - /// - /// ```rust - /// // composed forms of `ö` and `é` - /// let c = "Löwe 老虎 Léopard"; // German, Simplified Chinese, French - /// // decomposed forms of `ö` and `é` - /// let d = "Lo\u0308we 老虎 Le\u0301opard"; - /// - /// assert_eq!(c.char_len(), 15); - /// assert_eq!(d.char_len(), 17); - /// - /// assert_eq!(c.len(), 21); - /// assert_eq!(d.len(), 23); - /// - /// // the two strings *look* the same - /// println!("{}", c); - /// println!("{}", d); - /// ``` - fn char_len(&self) -> uint; - - /// Returns a slice of the given string from the byte range - /// [`begin`..`end`). - /// - /// This operation is `O(1)`. - /// - /// Fails when `begin` and `end` do not point to valid characters - /// or point beyond the last character of the string. - /// - /// See also `slice_to` and `slice_from` for slicing prefixes and - /// suffixes of strings, and `slice_chars` for slicing based on - /// code point counts. - /// - /// # Example - /// - /// ```rust - /// let s = "Löwe 老虎 Léopard"; - /// assert_eq!(s.slice(0, 1), "L"); - /// - /// assert_eq!(s.slice(1, 9), "öwe 老"); - /// - /// // these will fail: - /// // byte 2 lies within `ö`: - /// // s.slice(2, 3); - /// - /// // byte 8 lies within `老` - /// // s.slice(1, 8); - /// - /// // byte 100 is outside the string - /// // s.slice(3, 100); - /// ``` - fn slice(&self, begin: uint, end: uint) -> &'a str; - - /// Returns a slice of the string from `begin` to its end. - /// - /// Equivalent to `self.slice(begin, self.len())`. - /// - /// Fails when `begin` does not point to a valid character, or is - /// out of bounds. - /// - /// See also `slice`, `slice_to` and `slice_chars`. - fn slice_from(&self, begin: uint) -> &'a str; - - /// Returns a slice of the string from the beginning to byte - /// `end`. - /// - /// Equivalent to `self.slice(0, end)`. - /// - /// Fails when `end` does not point to a valid character, or is - /// out of bounds. - /// - /// See also `slice`, `slice_from` and `slice_chars`. - fn slice_to(&self, end: uint) -> &'a str; - - /// Returns a slice of the string from the character range - /// [`begin`..`end`). - /// - /// That is, start at the `begin`-th code point of the string and - /// continue to the `end`-th code point. This does not detect or - /// handle edge cases such as leaving a combining character as the - /// first code point of the string. - /// - /// Due to the design of UTF-8, this operation is `O(end)`. - /// See `slice`, `slice_to` and `slice_from` for `O(1)` - /// variants that use byte indices rather than code point - /// indices. - /// - /// Fails if `begin` > `end` or the either `begin` or `end` are - /// beyond the last character of the string. - /// - /// # Example - /// - /// ```rust - /// let s = "Löwe 老虎 Léopard"; - /// assert_eq!(s.slice_chars(0, 4), "Löwe"); - /// assert_eq!(s.slice_chars(5, 7), "老虎"); - /// ``` - fn slice_chars(&self, begin: uint, end: uint) -> &'a str; - - /// Returns true if `needle` is a prefix of the string. - fn starts_with(&self, needle: &str) -> bool; - - /// Returns true if `needle` is a suffix of the string. - fn ends_with(&self, needle: &str) -> bool; /// Escape each char in `s` with `char::escape_default`. - fn escape_default(&self) -> ~str; + fn escape_default(&self) -> ~str { + let me = self.as_slice(); + let mut out = StrBuf::with_capacity(me.len()); + for c in me.chars() { + c.escape_default(|c| out.push_char(c)); + } + out.into_owned() + } /// Escape each char in `s` with `char::escape_unicode`. - fn escape_unicode(&self) -> ~str; - - /// Returns a string with leading and trailing whitespace removed. - fn trim(&self) -> &'a str; - - /// Returns a string with leading whitespace removed. - fn trim_left(&self) -> &'a str; - - /// Returns a string with trailing whitespace removed. - fn trim_right(&self) -> &'a str; - - /// Returns a string with characters that match `to_trim` removed. - /// - /// # Arguments - /// - /// * to_trim - a character matcher - /// - /// # Example - /// - /// ```rust - /// assert_eq!("11foo1bar11".trim_chars('1'), "foo1bar") - /// assert_eq!("12foo1bar12".trim_chars(&['1', '2']), "foo1bar") - /// assert_eq!("123foo1bar123".trim_chars(|c: char| c.is_digit()), "foo1bar") - /// ``` - fn trim_chars(&self, to_trim: C) -> &'a str; - - /// Returns a string with leading `chars_to_trim` removed. - /// - /// # Arguments - /// - /// * to_trim - a character matcher - /// - /// # Example - /// - /// ```rust - /// assert_eq!("11foo1bar11".trim_left_chars('1'), "foo1bar11") - /// assert_eq!("12foo1bar12".trim_left_chars(&['1', '2']), "foo1bar12") - /// assert_eq!("123foo1bar123".trim_left_chars(|c: char| c.is_digit()), "foo1bar123") - /// ``` - fn trim_left_chars(&self, to_trim: C) -> &'a str; - - /// Returns a string with trailing `chars_to_trim` removed. - /// - /// # Arguments - /// - /// * to_trim - a character matcher - /// - /// # Example - /// - /// ```rust - /// assert_eq!("11foo1bar11".trim_right_chars('1'), "11foo1bar") - /// assert_eq!("12foo1bar12".trim_right_chars(&['1', '2']), "12foo1bar") - /// assert_eq!("123foo1bar123".trim_right_chars(|c: char| c.is_digit()), "123foo1bar") - /// ``` - fn trim_right_chars(&self, to_trim: C) -> &'a str; + fn escape_unicode(&self) -> ~str { + let me = self.as_slice(); + let mut out = StrBuf::with_capacity(me.len()); + for c in me.chars() { + c.escape_unicode(|c| out.push_char(c)); + } + out.into_owned() + } /// Replace all occurrences of one string with another. /// @@ -2035,529 +808,38 @@ pub trait StrSlice<'a> { /// // not found, so no change. /// assert_eq!(s.replace("cookie monster", "little lamb"), s); /// ``` - fn replace(&self, from: &str, to: &str) -> ~str; - - /// Copy a slice into a new owned str. - fn to_owned(&self) -> ~str; - - /// Converts to a vector of `u16` encoded as UTF-16. - fn to_utf16(&self) -> ~[u16]; - - /// Check that `index`-th byte lies at the start and/or end of a - /// UTF-8 code point sequence. - /// - /// The start and end of the string (when `index == self.len()`) - /// are considered to be boundaries. - /// - /// Fails if `index` is greater than `self.len()`. - /// - /// # Example - /// - /// ```rust - /// let s = "Löwe 老虎 Léopard"; - /// assert!(s.is_char_boundary(0)); - /// // start of `老` - /// assert!(s.is_char_boundary(6)); - /// assert!(s.is_char_boundary(s.len())); - /// - /// // second byte of `ö` - /// assert!(!s.is_char_boundary(2)); - /// - /// // third byte of `老` - /// assert!(!s.is_char_boundary(8)); - /// ``` - fn is_char_boundary(&self, index: uint) -> bool; - - /// Pluck a character out of a string and return the index of the next - /// character. - /// - /// This function can be used to iterate over the unicode characters of a - /// string. - /// - /// # Example - /// - /// This example manually iterate through the characters of a - /// string; this should normally by done by `.chars()` or - /// `.char_indices`. - /// - /// ```rust - /// use std::str::CharRange; - /// - /// let s = "中华Việt Nam"; - /// let mut i = 0u; - /// while i < s.len() { - /// let CharRange {ch, next} = s.char_range_at(i); - /// println!("{}: {}", i, ch); - /// i = next; - /// } - /// ``` - /// - /// ## Output - /// - /// ```ignore - /// 0: 中 - /// 3: 华 - /// 6: V - /// 7: i - /// 8: ệ - /// 11: t - /// 12: - /// 13: N - /// 14: a - /// 15: m - /// ``` - /// - /// # Arguments - /// - /// * s - The string - /// * i - The byte offset of the char to extract - /// - /// # Return value - /// - /// A record {ch: char, next: uint} containing the char value and the byte - /// index of the next unicode character. - /// - /// # Failure - /// - /// If `i` is greater than or equal to the length of the string. - /// If `i` is not the index of the beginning of a valid UTF-8 character. - fn char_range_at(&self, start: uint) -> CharRange; - - /// Given a byte position and a str, return the previous char and its position. - /// - /// This function can be used to iterate over a unicode string in reverse. - /// - /// Returns 0 for next index if called on start index 0. - fn char_range_at_reverse(&self, start: uint) -> CharRange; - - /// Plucks the character starting at the `i`th byte of a string - fn char_at(&self, i: uint) -> char; - - /// Plucks the character ending at the `i`th byte of a string - fn char_at_reverse(&self, i: uint) -> char; - - /// Work with the byte buffer of a string as a byte slice. - fn as_bytes(&self) -> &'a [u8]; - - /// Returns the byte index of the first character of `self` that - /// matches `search`. - /// - /// # Return value - /// - /// `Some` containing the byte index of the last matching character - /// or `None` if there is no match - /// - /// # Example - /// - /// ```rust - /// let s = "Löwe 老虎 Léopard"; - /// - /// assert_eq!(s.find('L'), Some(0)); - /// assert_eq!(s.find('é'), Some(14)); - /// - /// // the first space - /// assert_eq!(s.find(|c: char| c.is_whitespace()), Some(5)); - /// - /// // neither are found - /// assert_eq!(s.find(&['1', '2']), None); - /// ``` - fn find(&self, search: C) -> Option; - - /// Returns the byte index of the last character of `self` that - /// matches `search`. - /// - /// # Return value - /// - /// `Some` containing the byte index of the last matching character - /// or `None` if there is no match. - /// - /// # Example - /// - /// ```rust - /// let s = "Löwe 老虎 Léopard"; - /// - /// assert_eq!(s.rfind('L'), Some(13)); - /// assert_eq!(s.rfind('é'), Some(14)); - /// - /// // the second space - /// assert_eq!(s.rfind(|c: char| c.is_whitespace()), Some(12)); - /// - /// // searches for an occurrence of either `1` or `2`, but neither are found - /// assert_eq!(s.rfind(&['1', '2']), None); - /// ``` - fn rfind(&self, search: C) -> Option; - - /// Returns the byte index of the first matching substring - /// - /// # Arguments - /// - /// * `needle` - The string to search for - /// - /// # Return value - /// - /// `Some` containing the byte index of the first matching substring - /// or `None` if there is no match. - /// - /// # Example - /// - /// ```rust - /// let s = "Löwe 老虎 Léopard"; - /// - /// assert_eq!(s.find_str("老虎 L"), Some(6)); - /// assert_eq!(s.find_str("muffin man"), None); - /// ``` - fn find_str(&self, &str) -> Option; - - /// Given a string, make a new string with repeated copies of it. - fn repeat(&self, nn: uint) -> ~str; - - /// Retrieves the first character from a string slice and returns - /// it. This does not allocate a new string; instead, it returns a - /// slice that point one character beyond the character that was - /// shifted. If the string does not contain any characters, - /// a tuple of None and an empty string is returned instead. - /// - /// # Example - /// - /// ```rust - /// let s = "Löwe 老虎 Léopard"; - /// let (c, s1) = s.slice_shift_char(); - /// assert_eq!(c, Some('L')); - /// assert_eq!(s1, "öwe 老虎 Léopard"); - /// - /// let (c, s2) = s1.slice_shift_char(); - /// assert_eq!(c, Some('ö')); - /// assert_eq!(s2, "we 老虎 Léopard"); - /// ``` - fn slice_shift_char(&self) -> (Option, &'a str); - - /// Levenshtein Distance between two strings. - fn lev_distance(&self, t: &str) -> uint; - - /// Returns the byte offset of an inner slice relative to an enclosing outer slice. - /// - /// Fails if `inner` is not a direct slice contained within self. - /// - /// # Example - /// - /// ```rust - /// let string = "a\nb\nc"; - /// let lines: ~[&str] = string.lines().collect(); - /// - /// assert!(string.subslice_offset(lines[0]) == 0); // &"a" - /// assert!(string.subslice_offset(lines[1]) == 2); // &"b" - /// assert!(string.subslice_offset(lines[2]) == 4); // &"c" - /// ``` - fn subslice_offset(&self, inner: &str) -> uint; - - /// Return an unsafe pointer to the strings buffer. - /// - /// The caller must ensure that the string outlives this pointer, - /// and that it is not reallocated (e.g. by pushing to the - /// string). - fn as_ptr(&self) -> *u8; -} - -impl<'a> StrSlice<'a> for &'a str { - #[inline] - fn contains<'a>(&self, needle: &'a str) -> bool { - self.find_str(needle).is_some() - } - - #[inline] - fn contains_char(&self, needle: char) -> bool { - self.find(needle).is_some() - } - - #[inline] - fn chars(&self) -> Chars<'a> { - Chars{string: *self} - } - - #[inline] - #[deprecated = "replaced by .chars().rev()"] - fn chars_rev(&self) -> Rev> { - self.chars().rev() - } - - #[inline] - fn bytes(&self) -> Bytes<'a> { - self.as_bytes().iter().map(|&b| b) - } - - #[inline] - #[deprecated = "replaced by .bytes().rev()"] - fn bytes_rev(&self) -> Rev> { - self.bytes().rev() - } - - #[inline] - fn char_indices(&self) -> CharOffsets<'a> { - CharOffsets{string: *self, iter: self.chars()} - } - - #[inline] - #[deprecated = "replaced by .char_indices().rev()"] - fn char_indices_rev(&self) -> Rev> { - self.char_indices().rev() - } - - #[inline] - fn split(&self, sep: Sep) -> CharSplits<'a, Sep> { - CharSplits { - string: *self, - only_ascii: sep.only_ascii(), - sep: sep, - allow_trailing_empty: true, - finished: false, - } - } - - #[inline] - fn splitn(&self, sep: Sep, count: uint) - -> CharSplitsN<'a, Sep> { - CharSplitsN { - iter: self.split(sep), - count: count, - invert: false, - } - } - - #[inline] - fn split_terminator(&self, sep: Sep) - -> CharSplits<'a, Sep> { - CharSplits { - allow_trailing_empty: false, - ..self.split(sep) - } - } - - #[inline] - #[deprecated = "replaced by .split(sep).rev()"] - fn rsplit(&self, sep: Sep) -> Rev> { - self.split(sep).rev() - } - - #[inline] - fn rsplitn(&self, sep: Sep, count: uint) - -> CharSplitsN<'a, Sep> { - CharSplitsN { - iter: self.split(sep), - count: count, - invert: true, - } - } - - #[inline] - fn match_indices(&self, sep: &'a str) -> MatchIndices<'a> { - assert!(!sep.is_empty()) - MatchIndices { - haystack: *self, - needle: sep, - position: 0 - } - } - - #[inline] - fn split_str(&self, sep: &'a str) -> StrSplits<'a> { - StrSplits { - it: self.match_indices(sep), - last_end: 0, - finished: false - } - } - - #[inline] - fn lines(&self) -> CharSplits<'a, char> { - self.split_terminator('\n') - } - - fn lines_any(&self) -> AnyLines<'a> { - self.lines().map(|line| { - let l = line.len(); - if l > 0 && line[l - 1] == '\r' as u8 { line.slice(0, l - 1) } - else { line } - }) - } - - #[inline] - fn words(&self) -> Words<'a> { - self.split(char::is_whitespace).filter(|s| !s.is_empty()) - } - - #[inline] - fn nfd_chars(&self) -> Normalizations<'a> { - Normalizations { - iter: self.chars(), - buffer: Vec::new(), - sorted: false, - kind: NFD - } - } - - #[inline] - fn nfkd_chars(&self) -> Normalizations<'a> { - Normalizations { - iter: self.chars(), - buffer: Vec::new(), - sorted: false, - kind: NFKD - } - } - - #[inline] - fn is_whitespace(&self) -> bool { self.chars().all(char::is_whitespace) } - - #[inline] - fn is_alphanumeric(&self) -> bool { self.chars().all(char::is_alphanumeric) } - - #[inline] - fn char_len(&self) -> uint { self.chars().len() } - - #[inline] - fn slice(&self, begin: uint, end: uint) -> &'a str { - assert!(self.is_char_boundary(begin) && self.is_char_boundary(end)); - unsafe { raw::slice_bytes(*self, begin, end) } - } - - #[inline] - fn slice_from(&self, begin: uint) -> &'a str { - self.slice(begin, self.len()) - } - - #[inline] - fn slice_to(&self, end: uint) -> &'a str { - assert!(self.is_char_boundary(end)); - unsafe { raw::slice_bytes(*self, 0, end) } - } - - fn slice_chars(&self, begin: uint, end: uint) -> &'a str { - assert!(begin <= end); - let mut count = 0; - let mut begin_byte = None; - let mut end_byte = None; - - // This could be even more efficient by not decoding, - // only finding the char boundaries - for (idx, _) in self.char_indices() { - if count == begin { begin_byte = Some(idx); } - if count == end { end_byte = Some(idx); break; } - count += 1; - } - if begin_byte.is_none() && count == begin { begin_byte = Some(self.len()) } - if end_byte.is_none() && count == end { end_byte = Some(self.len()) } - - match (begin_byte, end_byte) { - (None, _) => fail!("slice_chars: `begin` is beyond end of string"), - (_, None) => fail!("slice_chars: `end` is beyond end of string"), - (Some(a), Some(b)) => unsafe { raw::slice_bytes(*self, a, b) } - } - } - - #[inline] - fn starts_with<'a>(&self, needle: &'a str) -> bool { - let n = needle.len(); - self.len() >= n && needle.as_bytes() == self.as_bytes().slice_to(n) - } - - #[inline] - fn ends_with(&self, needle: &str) -> bool { - let (m, n) = (self.len(), needle.len()); - m >= n && needle.as_bytes() == self.as_bytes().slice_from(m - n) - } - - fn escape_default(&self) -> ~str { - let mut out = StrBuf::with_capacity(self.len()); - for c in self.chars() { - c.escape_default(|c| out.push_char(c)); - } - out.into_owned() - } - - fn escape_unicode(&self) -> ~str { - let mut out = StrBuf::with_capacity(self.len()); - for c in self.chars() { - c.escape_unicode(|c| out.push_char(c)); - } - out.into_owned() - } - - #[inline] - fn trim(&self) -> &'a str { - self.trim_left().trim_right() - } - - #[inline] - fn trim_left(&self) -> &'a str { - self.trim_left_chars(char::is_whitespace) - } - - #[inline] - fn trim_right(&self) -> &'a str { - self.trim_right_chars(char::is_whitespace) - } - - #[inline] - fn trim_chars(&self, mut to_trim: C) -> &'a str { - let cur = match self.find(|c: char| !to_trim.matches(c)) { - None => "", - Some(i) => unsafe { raw::slice_bytes(*self, i, self.len()) } - }; - match cur.rfind(|c: char| !to_trim.matches(c)) { - None => "", - Some(i) => { - let right = cur.char_range_at(i).next; - unsafe { raw::slice_bytes(cur, 0, right) } - } - } - } - - #[inline] - fn trim_left_chars(&self, mut to_trim: C) -> &'a str { - match self.find(|c: char| !to_trim.matches(c)) { - None => "", - Some(first) => unsafe { raw::slice_bytes(*self, first, self.len()) } - } - } - - #[inline] - fn trim_right_chars(&self, mut to_trim: C) -> &'a str { - match self.rfind(|c: char| !to_trim.matches(c)) { - None => "", - Some(last) => { - let next = self.char_range_at(last).next; - unsafe { raw::slice_bytes(*self, 0u, next) } - } - } - } - fn replace(&self, from: &str, to: &str) -> ~str { + let me = self.as_slice(); let mut result = StrBuf::new(); let mut last_end = 0; - for (start, end) in self.match_indices(from) { - result.push_str(unsafe{raw::slice_bytes(*self, last_end, start)}); + for (start, end) in me.match_indices(from) { + result.push_str(unsafe{raw::slice_bytes(me, last_end, start)}); result.push_str(to); last_end = end; } - result.push_str(unsafe{raw::slice_bytes(*self, last_end, self.len())}); + result.push_str(unsafe{raw::slice_bytes(me, last_end, me.len())}); result.into_owned() } + /// Copy a slice into a new owned str. #[inline] fn to_owned(&self) -> ~str { - let len = self.len(); + let me = self.as_slice(); + let len = me.len(); unsafe { let mut v = Vec::with_capacity(len); - ptr::copy_memory(v.as_mut_ptr(), self.as_ptr(), len); + ptr::copy_memory(v.as_mut_ptr(), me.as_ptr(), len); v.set_len(len); ::cast::transmute(v.move_iter().collect::<~[u8]>()) } } + /// Converts to a vector of `u16` encoded as UTF-16. fn to_utf16(&self) -> ~[u16] { + let me = self.as_slice(); let mut u = Vec::new();; - for ch in self.chars() { + for ch in me.chars() { let mut buf = [0u16, ..2]; let n = ch.encode_utf16(buf /* as mut slice! */); u.push_all(buf.slice_to(n)); @@ -2565,133 +847,20 @@ impl<'a> StrSlice<'a> for &'a str { u.move_iter().collect() } - #[inline] - fn is_char_boundary(&self, index: uint) -> bool { - if index == self.len() { return true; } - let b = self[index]; - return b < 128u8 || b >= 192u8; - } - - #[inline] - fn char_range_at(&self, i: uint) -> CharRange { - if self[i] < 128u8 { - return CharRange {ch: self[i] as char, next: i + 1 }; - } - - // Multibyte case is a fn to allow char_range_at to inline cleanly - fn multibyte_char_range_at(s: &str, i: uint) -> CharRange { - let mut val = s[i] as u32; - let w = UTF8_CHAR_WIDTH[val as uint] as uint; - assert!((w != 0)); - - val = utf8_first_byte!(val, w); - val = utf8_acc_cont_byte!(val, s[i + 1]); - if w > 2 { val = utf8_acc_cont_byte!(val, s[i + 2]); } - if w > 3 { val = utf8_acc_cont_byte!(val, s[i + 3]); } - - return CharRange {ch: unsafe { transmute(val) }, next: i + w}; - } - - return multibyte_char_range_at(*self, i); - } - - #[inline] - fn char_range_at_reverse(&self, start: uint) -> CharRange { - let mut prev = start; - - prev = prev.saturating_sub(1); - if self[prev] < 128 { return CharRange{ch: self[prev] as char, next: prev} } - - // Multibyte case is a fn to allow char_range_at_reverse to inline cleanly - fn multibyte_char_range_at_reverse(s: &str, mut i: uint) -> CharRange { - // while there is a previous byte == 10...... - while i > 0 && s[i] & 192u8 == TAG_CONT_U8 { - i -= 1u; - } - - let mut val = s[i] as u32; - let w = UTF8_CHAR_WIDTH[val as uint] as uint; - assert!((w != 0)); - - val = utf8_first_byte!(val, w); - val = utf8_acc_cont_byte!(val, s[i + 1]); - if w > 2 { val = utf8_acc_cont_byte!(val, s[i + 2]); } - if w > 3 { val = utf8_acc_cont_byte!(val, s[i + 3]); } - - return CharRange {ch: unsafe { transmute(val) }, next: i}; - } - - return multibyte_char_range_at_reverse(*self, prev); - } - - #[inline] - fn char_at(&self, i: uint) -> char { - self.char_range_at(i).ch - } - - #[inline] - fn char_at_reverse(&self, i: uint) -> char { - self.char_range_at_reverse(i).ch - } - - #[inline] - fn as_bytes(&self) -> &'a [u8] { - unsafe { cast::transmute(*self) } - } - - fn find(&self, mut search: C) -> Option { - if search.only_ascii() { - self.bytes().position(|b| search.matches(b as char)) - } else { - for (index, c) in self.char_indices() { - if search.matches(c) { return Some(index); } - } - None - } - } - - fn rfind(&self, mut search: C) -> Option { - if search.only_ascii() { - self.bytes().rposition(|b| search.matches(b as char)) - } else { - for (index, c) in self.char_indices().rev() { - if search.matches(c) { return Some(index); } - } - None - } - } - - fn find_str(&self, needle: &str) -> Option { - if needle.is_empty() { - Some(0) - } else { - self.match_indices(needle) - .next() - .map(|(start, _end)| start) - } - } - + /// Given a string, make a new string with repeated copies of it. fn repeat(&self, nn: uint) -> ~str { - let mut ret = StrBuf::with_capacity(nn * self.len()); + let me = self.as_slice(); + let mut ret = StrBuf::with_capacity(nn * me.len()); for _ in range(0, nn) { - ret.push_str(*self); + ret.push_str(me); } ret.into_owned() } - #[inline] - fn slice_shift_char(&self) -> (Option, &'a str) { - if self.is_empty() { - return (None, *self); - } else { - let CharRange {ch, next} = self.char_range_at(0u); - let next_s = unsafe { raw::slice_bytes(*self, next, self.len()) }; - return (Some(ch), next_s); - } - } - + /// Levenshtein Distance between two strings. fn lev_distance(&self, t: &str) -> uint { - let slen = self.len(); + let me = self.as_slice(); + let slen = me.len(); let tlen = t.len(); if slen == 0 { return tlen; } @@ -2699,7 +868,7 @@ impl<'a> StrSlice<'a> for &'a str { let mut dcol = Vec::from_fn(tlen + 1, |x| x); - for (i, sc) in self.chars().enumerate() { + for (i, sc) in me.chars().enumerate() { let mut current = i; *dcol.get_mut(0) = current + 1; @@ -2723,21 +892,39 @@ impl<'a> StrSlice<'a> for &'a str { return *dcol.get(tlen); } - fn subslice_offset(&self, inner: &str) -> uint { - let a_start = self.as_ptr() as uint; - let a_end = a_start + self.len(); - let b_start = inner.as_ptr() as uint; - let b_end = b_start + inner.len(); - - assert!(a_start <= b_start); - assert!(b_end <= a_end); - b_start - a_start - } - + /// An Iterator over the string in Unicode Normalization Form D + /// (canonical decomposition). #[inline] - fn as_ptr(&self) -> *u8 { - self.repr().data + fn nfd_chars<'a>(&'a self) -> Normalizations<'a> { + Normalizations { + iter: self.as_slice().chars(), + buffer: Vec::new(), + sorted: false, + kind: NFD + } } + + /// An Iterator over the string in Unicode Normalization Form KD + /// (compatibility decomposition). + #[inline] + fn nfkd_chars<'a>(&'a self) -> Normalizations<'a> { + Normalizations { + iter: self.as_slice().chars(), + buffer: Vec::new(), + sorted: false, + kind: NFKD + } + } +} + +impl<'a> StrAllocating for &'a str { + #[inline] + fn into_owned(self) -> ~str { self.to_owned() } +} + +impl<'a> StrAllocating for ~str { + #[inline] + fn into_owned(self) -> ~str { self } } /// Methods for owned strings @@ -2765,32 +952,6 @@ impl OwnedStr for ~str { } } -impl Clone for ~str { - #[inline] - fn clone(&self) -> ~str { - self.to_owned() - } -} - -impl FromIterator for ~str { - #[inline] - fn from_iter>(iterator: T) -> ~str { - let (lower, _) = iterator.size_hint(); - let mut buf = StrBuf::with_capacity(lower); - buf.extend(iterator); - buf.into_owned() - } -} - -// This works because every lifetime is a sub-lifetime of 'static -impl<'a> Default for &'a str { - fn default() -> &'a str { "" } -} - -impl Default for ~str { - fn default() -> ~str { "".to_owned() } -} - #[cfg(test)] mod tests { use iter::AdditiveIterator; diff --git a/src/libstd/strbuf.rs b/src/libstd/strbuf.rs index 86da11aceda..ad703b8054b 100644 --- a/src/libstd/strbuf.rs +++ b/src/libstd/strbuf.rs @@ -20,8 +20,8 @@ use iter::{Extendable, FromIterator, Iterator, range}; use option::{None, Option, Some}; use ptr::RawPtr; use slice::{OwnedVector, Vector}; +use str::{OwnedStr, Str, StrSlice, StrAllocating}; use str; -use str::{OwnedStr, Str, StrSlice}; use vec::Vec; /// A growable string stored as a UTF-8 encoded buffer. @@ -268,7 +268,9 @@ impl Str for StrBuf { cast::transmute(self.vec.as_slice()) } } +} +impl StrAllocating for StrBuf { #[inline] fn into_owned(self) -> ~str { let StrBuf { diff --git a/src/libstd/task.rs b/src/libstd/task.rs index 7c5e984ad36..23831e40a8b 100644 --- a/src/libstd/task.rs +++ b/src/libstd/task.rs @@ -49,7 +49,7 @@ use str::{Str, SendStr, IntoMaybeOwned}; #[cfg(test)] use any::{AnyOwnExt, AnyRefExt}; #[cfg(test)] use result; -#[cfg(test)] use str::StrSlice; +#[cfg(test)] use str::StrAllocating; /// Indicates the manner in which a task exited. /// diff --git a/src/libstd/to_str.rs b/src/libstd/to_str.rs index bdeae0340e2..4132c8a5b5a 100644 --- a/src/libstd/to_str.rs +++ b/src/libstd/to_str.rs @@ -35,7 +35,7 @@ impl ToStr for T { #[cfg(test)] mod tests { use super::*; - use str::StrSlice; + use str::StrAllocating; #[test] fn test_simple_types() { diff --git a/src/libsyntax/util/interner.rs b/src/libsyntax/util/interner.rs index b7932da8738..d705da7b72b 100644 --- a/src/libsyntax/util/interner.rs +++ b/src/libsyntax/util/interner.rs @@ -109,11 +109,6 @@ impl Str for RcStr { let s: &'a str = *self.string; s } - - #[inline] - fn into_owned(self) -> ~str { - self.string.to_owned() - } } impl fmt::Show for RcStr {