diff --git a/src/libcollections/str.rs b/src/libcollections/str.rs index bb03575b3ac..8c9346639b3 100644 --- a/src/libcollections/str.rs +++ b/src/libcollections/str.rs @@ -55,34 +55,31 @@ use self::MaybeOwned::*; use self::RecompositionState::*; use self::DecompositionType::*; +use core::prelude::*; + use core::borrow::{BorrowFrom, Cow, ToOwned}; -use core::clone::Clone; +use core::cmp::{mod, Equiv, PartialEq, Eq, PartialOrd, Ord, Ordering}; use core::default::Default; use core::fmt; use core::hash; -use core::char::Char; -use core::cmp::{mod, Eq, Equiv, Ord, Ordering, PartialEq, PartialOrd}; -use core::iter::{range, AdditiveIterator, Iterator, IteratorExt}; -use core::kinds::Sized; -use core::option::Option::{mod, Some, None}; -use core::slice::{AsSlice, SliceExt}; +use core::iter::AdditiveIterator; +use core::iter::{mod, range, Iterator, IteratorExt}; +use core::str as core_str; +use unicode::str::{UnicodeStr, Utf16Encoder}; use ring_buf::RingBuf; -use string::String; +use string::{String, ToString}; use unicode; use vec::Vec; -pub use core::str::{from_utf8, CharEq, Chars, CharOffsets}; +pub use core::str::{from_utf8, CharEq, Chars, CharIndices}; pub use core::str::{Bytes, CharSplits}; -pub use core::str::{CharSplitsN, AnyLines, MatchIndices, StrSplits}; -pub use core::str::{Utf16Encoder, Utf16CodeUnits}; -pub use core::str::{eq_slice, is_utf8, is_utf16, Utf16Items}; -pub use core::str::{Utf16Item, ScalarValue, LoneSurrogate, utf16_items}; -pub use core::str::{truncate_utf16_at_nul, utf8_char_width, CharRange}; -pub use core::str::{FromStr, from_str}; -pub use core::str::{Str, StrPrelude}; +pub use core::str::{CharSplitsN, Lines, LinesAny, MatchIndices, StrSplits}; +pub use core::str::{CharRange}; +pub use core::str::{FromStr, from_str, Utf8Error}; +pub use core::str::Str; pub use core::str::{from_utf8_unchecked, from_c_str}; -pub use unicode::str::{UnicodeStrPrelude, Words, Graphemes, GraphemeIndices}; +pub use unicode::str::{Words, Graphemes, GraphemeIndices}; 
// FIXME(conventions): ensure bit/char conventions are followed by str's API @@ -91,6 +88,7 @@ Section: Creating a string */ /// Methods for vectors of strings. +#[unstable = "functionality may be replaced with iterators"] pub trait StrVector for Sized? { /// Concatenates a vector of strings. /// @@ -117,6 +115,7 @@ pub trait StrVector for Sized? { fn connect(&self, sep: &str) -> String; } +#[allow(deprecated)] impl StrVector for [S] { fn concat(&self) -> String { if self.is_empty() { @@ -129,7 +128,7 @@ impl StrVector for [S] { let mut result = String::with_capacity(len); for s in self.iter() { - result.push_str(s.as_slice()) + result.push_str(s.as_slice()); } result @@ -379,6 +378,21 @@ impl<'a> Iterator for Recompositions<'a> { } } +/// External iterator for a string's UTF16 codeunits. +/// Use with the `std::iter` module. +#[deriving(Clone)] +pub struct Utf16Units<'a> { + encoder: Utf16Encoder> +} + +impl<'a> Iterator for Utf16Units<'a> { + #[inline] + fn next(&mut self) -> Option { self.encoder.next() } + + #[inline] + fn size_hint(&self) -> (uint, Option) { self.encoder.size_hint() } +} + /// Replaces all occurrences of one string with another. /// /// # Arguments @@ -399,16 +413,9 @@ impl<'a> Iterator for Recompositions<'a> { /// let new_string = str::replace(string, "or", "str"); /// assert_eq!(new_string.as_slice(), "strange"); /// ``` +#[deprecated = "call the inherent method instead"] pub fn replace(s: &str, from: &str, to: &str) -> String { - let mut result = String::new(); - let mut last_end = 0; - for (start, end) in s.match_indices(from) { - result.push_str(unsafe { s.slice_unchecked(last_end, start) }); - result.push_str(to); - last_end = end; - } - result.push_str(unsafe { s.slice_unchecked(last_end, s.len()) }); - result + s.replace(from, to) } /* @@ -434,7 +441,7 @@ Section: MaybeOwned /// A string type that can hold either a `String` or a `&str`. /// This can be useful as an optimization when an allocation is sometimes /// needed but not always. 
-#[deprecated = "use std::str::CowString"] +#[deprecated = "use std::string::CowString"] pub enum MaybeOwned<'a> { /// A borrowed string. Slice(&'a str), @@ -443,9 +450,10 @@ pub enum MaybeOwned<'a> { } /// A specialization of `CowString` to be sendable. +#[deprecated = "use std::string::CowString<'static>"] pub type SendStr = CowString<'static>; -#[deprecated = "use std::str::CowString"] +#[deprecated = "use std::string::CowString"] impl<'a> MaybeOwned<'a> { /// Returns `true` if this `MaybeOwned` wraps an owned string. /// @@ -483,6 +491,7 @@ impl<'a> MaybeOwned<'a> { /// Return the number of bytes in this string. #[inline] + #[allow(deprecated)] pub fn len(&self) -> uint { self.as_slice().len() } /// Returns true if the string contains no bytes @@ -545,7 +554,8 @@ impl<'a> IntoMaybeOwned<'a> for MaybeOwned<'a> { fn into_maybe_owned(self) -> MaybeOwned<'a> { self } } -#[deprecated = "use std::str::CowString"] +#[deprecated = "use std::string::CowString"] +#[allow(deprecated)] impl<'a> PartialEq for MaybeOwned<'a> { #[inline] fn eq(&self, other: &MaybeOwned) -> bool { @@ -553,10 +563,10 @@ impl<'a> PartialEq for MaybeOwned<'a> { } } -#[deprecated = "use std::str::CowString"] +#[deprecated = "use std::string::CowString"] impl<'a> Eq for MaybeOwned<'a> {} -#[deprecated = "use std::str::CowString"] +#[deprecated = "use std::string::CowString"] impl<'a> PartialOrd for MaybeOwned<'a> { #[inline] fn partial_cmp(&self, other: &MaybeOwned) -> Option { @@ -564,16 +574,17 @@ impl<'a> PartialOrd for MaybeOwned<'a> { } } -#[deprecated = "use std::str::CowString"] +#[deprecated = "use std::string::CowString"] impl<'a> Ord for MaybeOwned<'a> { #[inline] + #[allow(deprecated)] fn cmp(&self, other: &MaybeOwned) -> Ordering { self.as_slice().cmp(other.as_slice()) } } #[allow(deprecated)] -#[deprecated = "use std::str::CowString"] +#[deprecated = "use std::string::CowString"] impl<'a, S: Str> Equiv for MaybeOwned<'a> { #[inline] fn equiv(&self, other: &S) -> bool { @@ -581,9 
+592,9 @@ impl<'a, S: Str> Equiv for MaybeOwned<'a> { } } -#[deprecated = "use std::str::CowString"] +#[deprecated = "use std::string::CowString"] +#[allow(deprecated)] impl<'a> Str for MaybeOwned<'a> { - #[allow(deprecated)] #[inline] fn as_slice<'b>(&'b self) -> &'b str { match *self { @@ -593,19 +604,7 @@ impl<'a> Str for MaybeOwned<'a> { } } -#[deprecated = "use std::str::CowString"] -impl<'a> StrAllocating for MaybeOwned<'a> { - #[allow(deprecated)] - #[inline] - fn into_string(self) -> String { - match self { - Slice(s) => String::from_str(s), - Owned(s) => s - } - } -} - -#[deprecated = "use std::str::CowString"] +#[deprecated = "use std::string::CowString"] impl<'a> Clone for MaybeOwned<'a> { #[allow(deprecated)] #[inline] @@ -617,14 +616,15 @@ impl<'a> Clone for MaybeOwned<'a> { } } -#[deprecated = "use std::str::CowString"] +#[deprecated = "use std::string::CowString"] impl<'a> Default for MaybeOwned<'a> { #[allow(deprecated)] #[inline] fn default() -> MaybeOwned<'a> { Slice("") } } -#[deprecated = "use std::str::CowString"] +#[deprecated = "use std::string::CowString"] +#[allow(deprecated)] impl<'a, H: hash::Writer> hash::Hash for MaybeOwned<'a> { #[inline] fn hash(&self, hasher: &mut H) { @@ -632,7 +632,7 @@ impl<'a, H: hash::Writer> hash::Hash for MaybeOwned<'a> { } } -#[deprecated = "use std::str::CowString"] +#[deprecated = "use std::string::CowString"] impl<'a> fmt::Show for MaybeOwned<'a> { #[inline] fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { @@ -650,10 +650,11 @@ impl BorrowFrom for str { #[unstable = "trait is unstable"] impl ToOwned for str { - fn to_owned(&self) -> String { self.into_string() } + fn to_owned(&self) -> String { self.to_string() } } /// Unsafe string operations. 
+#[deprecated] pub mod raw { pub use core::str::raw::{from_utf8, c_str_to_static_slice, slice_bytes}; pub use core::str::raw::{slice_unchecked}; @@ -664,46 +665,25 @@ Section: CowString */ /// A clone-on-write string +#[deprecated = "use std::string::CowString instead"] pub type CowString<'a> = Cow<'a, String, str>; -impl<'a> Str for CowString<'a> { - #[inline] - fn as_slice<'b>(&'b self) -> &'b str { - (**self).as_slice() - } -} - /* Section: Trait implementations */ /// Any string that can be represented as a slice. -pub trait StrAllocating: Str { - /// Converts `self` into a `String`, not making a copy if possible. - fn into_string(self) -> String; - +pub trait StrExt for Sized?: Slice { /// Escapes each char in `s` with `char::escape_default`. + #[unstable = "return type may change to be an iterator"] fn escape_default(&self) -> String { - let me = self.as_slice(); - let mut out = String::with_capacity(me.len()); - for c in me.chars() { - for c in c.escape_default() { - out.push(c); - } - } - out + self.chars().flat_map(|c| c.escape_default()).collect() } /// Escapes each char in `s` with `char::escape_unicode`. + #[unstable = "return type may change to be an iterator"] fn escape_unicode(&self) -> String { - let me = self.as_slice(); - let mut out = String::with_capacity(me.len()); - for c in me.chars() { - for c in c.escape_unicode() { - out.push(c); - } - } - out + self.chars().flat_map(|c| c.escape_unicode()).collect() } /// Replaces all occurrences of one string with another. @@ -730,25 +710,31 @@ pub trait StrAllocating: Str { /// // not found, so no change. 
/// assert_eq!(s.replace("cookie monster", "little lamb"), s); /// ``` + #[unstable = "awaiting pattern/matcher stabilization"] fn replace(&self, from: &str, to: &str) -> String { - replace(self.as_slice(), from, to) + let mut result = String::new(); + let mut last_end = 0; + for (start, end) in self.match_indices(from) { + result.push_str(unsafe { self.slice_unchecked(last_end, start) }); + result.push_str(to); + last_end = end; + } + result.push_str(unsafe { self.slice_unchecked(last_end, self.len()) }); + result } /// Given a string, makes a new string with repeated copies of it. + #[deprecated = "use repeat(self).take(n).collect() instead"] fn repeat(&self, nn: uint) -> String { - let me = self.as_slice(); - let mut ret = String::with_capacity(nn * me.len()); - for _ in range(0, nn) { - ret.push_str(me); - } - ret + iter::repeat(self[]).take(nn).collect() } /// Returns the Levenshtein Distance between two strings. + #[deprecated = "this function will be removed"] fn lev_distance(&self, t: &str) -> uint { - let me = self.as_slice(); - if me.is_empty() { return t.char_len(); } - if t.is_empty() { return me.char_len(); } + let me = self[]; + if me.is_empty() { return t.chars().count(); } + if t.is_empty() { return me.chars().count(); } let mut dcol = Vec::from_fn(t.len() + 1, |x| x); let mut t_last = 0; @@ -780,9 +766,10 @@ pub trait StrAllocating: Str { /// Returns an iterator over the string in Unicode Normalization Form D /// (canonical decomposition). #[inline] + #[unstable = "this functionality may only be provided by libunicode"] fn nfd_chars<'a>(&'a self) -> Decompositions<'a> { Decompositions { - iter: self.as_slice().chars(), + iter: self[].chars(), buffer: Vec::new(), sorted: false, kind: Canonical @@ -792,9 +779,10 @@ pub trait StrAllocating: Str { /// Returns an iterator over the string in Unicode Normalization Form KD /// (compatibility decomposition). 
#[inline] + #[unstable = "this functionality may only be provided by libunicode"] fn nfkd_chars<'a>(&'a self) -> Decompositions<'a> { Decompositions { - iter: self.as_slice().chars(), + iter: self[].chars(), buffer: Vec::new(), sorted: false, kind: Compatible @@ -804,6 +792,7 @@ pub trait StrAllocating: Str { /// An Iterator over the string in Unicode Normalization Form C /// (canonical decomposition followed by canonical composition). #[inline] + #[unstable = "this functionality may only be provided by libunicode"] fn nfc_chars<'a>(&'a self) -> Recompositions<'a> { Recompositions { iter: self.nfd_chars(), @@ -817,6 +806,7 @@ pub trait StrAllocating: Str { /// An Iterator over the string in Unicode Normalization Form KC /// (compatibility decomposition followed by canonical composition). #[inline] + #[unstable = "this functionality may only be provided by libunicode"] fn nfkc_chars<'a>(&'a self) -> Recompositions<'a> { Recompositions { iter: self.nfkd_chars(), @@ -826,15 +816,912 @@ pub trait StrAllocating: Str { last_ccc: None } } -} -impl<'a> StrAllocating for &'a str { + /// Returns true if one string contains another + /// + /// # Arguments + /// + /// - needle - The string to look for + /// + /// # Example + /// + /// ```rust + /// assert!("bananas".contains("nana")); + /// ``` + #[unstable = "awaiting pattern/matcher stabilization"] + fn contains(&self, needle: &str) -> bool { + core_str::StrExt::contains(self[], needle) + } + + /// Returns true if a string contains a char. + /// + /// # Arguments + /// + /// - needle - The char to look for + /// + /// # Example + /// + /// ```rust + /// assert!("hello".contains_char('e')); + /// ``` + #[unstable = "awaiting pattern/matcher stabilization"] + fn contains_char(&self, needle: char) -> bool { + core_str::StrExt::contains_char(self[], needle) + } + + /// An iterator over the characters of `self`. Note, this iterates + /// over Unicode code-points, not Unicode graphemes. 
+ /// + /// # Example + /// + /// ```rust + /// let v: Vec = "abc åäö".chars().collect(); + /// assert_eq!(v, vec!['a', 'b', 'c', ' ', 'å', 'ä', 'ö']); + /// ``` + #[stable] + fn chars(&self) -> Chars { + core_str::StrExt::chars(self[]) + } + + /// An iterator over the bytes of `self` + /// + /// # Example + /// + /// ```rust + /// let v: Vec = "bors".bytes().collect(); + /// assert_eq!(v, b"bors".to_vec()); + /// ``` + #[stable] + fn bytes(&self) -> Bytes { + core_str::StrExt::bytes(self[]) + } + + /// An iterator over the characters of `self` and their byte offsets. + #[stable] + fn char_indices(&self) -> CharIndices { + core_str::StrExt::char_indices(self[]) + } + + /// An iterator over substrings of `self`, separated by characters + /// matched by `sep`. + /// + /// # Example + /// + /// ```rust + /// let v: Vec<&str> = "Mary had a little lamb".split(' ').collect(); + /// assert_eq!(v, vec!["Mary", "had", "a", "little", "lamb"]); + /// + /// let v: Vec<&str> = "abc1def2ghi".split(|c: char| c.is_numeric()).collect(); + /// assert_eq!(v, vec!["abc", "def", "ghi"]); + /// + /// let v: Vec<&str> = "lionXXtigerXleopard".split('X').collect(); + /// assert_eq!(v, vec!["lion", "", "tiger", "leopard"]); + /// + /// let v: Vec<&str> = "".split('X').collect(); + /// assert_eq!(v, vec![""]); + /// ``` + #[unstable = "awaiting pattern/matcher stabilization"] + fn split(&self, sep: Sep) -> CharSplits { + core_str::StrExt::split(self[], sep) + } + + /// An iterator over substrings of `self`, separated by characters + /// matched by `sep`, restricted to splitting at most `count` + /// times. 
+ /// + /// # Example + /// + /// ```rust + /// let v: Vec<&str> = "Mary had a little lambda".splitn(2, ' ').collect(); + /// assert_eq!(v, vec!["Mary", "had", "a little lambda"]); + /// + /// let v: Vec<&str> = "abc1def2ghi".splitn(1, |c: char| c.is_numeric()).collect(); + /// assert_eq!(v, vec!["abc", "def2ghi"]); + /// + /// let v: Vec<&str> = "lionXXtigerXleopard".splitn(2, 'X').collect(); + /// assert_eq!(v, vec!["lion", "", "tigerXleopard"]); + /// + /// let v: Vec<&str> = "abcXdef".splitn(0, 'X').collect(); + /// assert_eq!(v, vec!["abcXdef"]); + /// + /// let v: Vec<&str> = "".splitn(1, 'X').collect(); + /// assert_eq!(v, vec![""]); + /// ``` + #[unstable = "awaiting pattern/matcher stabilization"] + fn splitn(&self, count: uint, sep: Sep) -> CharSplitsN { + core_str::StrExt::splitn(self[], count, sep) + } + + /// An iterator over substrings of `self`, separated by characters + /// matched by `sep`. + /// + /// Equivalent to `split`, except that the trailing substring + /// is skipped if empty (terminator semantics). 
+ /// + /// # Example + /// + /// ```rust + /// let v: Vec<&str> = "A.B.".split_terminator('.').collect(); + /// assert_eq!(v, vec!["A", "B"]); + /// + /// let v: Vec<&str> = "A..B..".split_terminator('.').collect(); + /// assert_eq!(v, vec!["A", "", "B", ""]); + /// + /// let v: Vec<&str> = "Mary had a little lamb".split(' ').rev().collect(); + /// assert_eq!(v, vec!["lamb", "little", "a", "had", "Mary"]); + /// + /// let v: Vec<&str> = "abc1def2ghi".split(|c: char| c.is_numeric()).rev().collect(); + /// assert_eq!(v, vec!["ghi", "def", "abc"]); + /// + /// let v: Vec<&str> = "lionXXtigerXleopard".split('X').rev().collect(); + /// assert_eq!(v, vec!["leopard", "tiger", "", "lion"]); + /// ``` + #[unstable = "awaiting pattern/matcher stabilization"] + fn split_terminator(&self, sep: Sep) -> CharSplits { + core_str::StrExt::split_terminator(self[], sep) + } + + /// An iterator over substrings of `self`, separated by characters + /// matched by `sep`, starting from the end of the string. + /// Restricted to splitting at most `count` times. + /// + /// # Example + /// + /// ```rust + /// let v: Vec<&str> = "Mary had a little lamb".rsplitn(2, ' ').collect(); + /// assert_eq!(v, vec!["lamb", "little", "Mary had a"]); + /// + /// let v: Vec<&str> = "abc1def2ghi".rsplitn(1, |c: char| c.is_numeric()).collect(); + /// assert_eq!(v, vec!["ghi", "abc1def"]); + /// + /// let v: Vec<&str> = "lionXXtigerXleopard".rsplitn(2, 'X').collect(); + /// assert_eq!(v, vec!["leopard", "tiger", "lionX"]); + /// ``` + #[unstable = "awaiting pattern/matcher stabilization"] + fn rsplitn(&self, count: uint, sep: Sep) -> CharSplitsN { + core_str::StrExt::rsplitn(self[], count, sep) + } + + /// An iterator over the start and end indices of the disjoint + /// matches of `sep` within `self`. + /// + /// That is, each returned value `(start, end)` satisfies + /// `self.slice(start, end) == sep`. 
For matches of `sep` within + /// `self` that overlap, only the indices corresponding to the + /// first match are returned. + /// + /// # Example + /// + /// ```rust + /// let v: Vec<(uint, uint)> = "abcXXXabcYYYabc".match_indices("abc").collect(); + /// assert_eq!(v, vec![(0,3), (6,9), (12,15)]); + /// + /// let v: Vec<(uint, uint)> = "1abcabc2".match_indices("abc").collect(); + /// assert_eq!(v, vec![(1,4), (4,7)]); + /// + /// let v: Vec<(uint, uint)> = "ababa".match_indices("aba").collect(); + /// assert_eq!(v, vec![(0, 3)]); // only the first `aba` + /// ``` + #[unstable = "awaiting pattern/matcher stabilization"] + fn match_indices<'a>(&'a self, sep: &'a str) -> MatchIndices<'a> { + core_str::StrExt::match_indices(self[], sep) + } + + /// An iterator over the substrings of `self` separated by `sep`. + /// + /// # Example + /// + /// ```rust + /// let v: Vec<&str> = "abcXXXabcYYYabc".split_str("abc").collect(); + /// assert_eq!(v, vec!["", "XXX", "YYY", ""]); + /// + /// let v: Vec<&str> = "1abcabc2".split_str("abc").collect(); + /// assert_eq!(v, vec!["1", "", "2"]); + /// ``` + #[unstable = "awaiting pattern/matcher stabilization"] + fn split_str<'a>(&'a self, s: &'a str) -> StrSplits<'a> { + core_str::StrExt::split_str(self[], s) + } + + /// An iterator over the lines of a string (subsequences separated + /// by `\n`). This does not include the empty string after a + /// trailing `\n`. + /// + /// # Example + /// + /// ```rust + /// let four_lines = "foo\nbar\n\nbaz\n"; + /// let v: Vec<&str> = four_lines.lines().collect(); + /// assert_eq!(v, vec!["foo", "bar", "", "baz"]); + /// ``` + #[stable] + fn lines(&self) -> Lines { + core_str::StrExt::lines(self[]) + } + + /// An iterator over the lines of a string, separated by either + /// `\n` or `\r\n`. As with `.lines()`, this does not include an + /// empty trailing line. 
+ /// + /// # Example + /// + /// ```rust + /// let four_lines = "foo\r\nbar\n\r\nbaz\n"; + /// let v: Vec<&str> = four_lines.lines_any().collect(); + /// assert_eq!(v, vec!["foo", "bar", "", "baz"]); + /// ``` + #[stable] + fn lines_any(&self) -> LinesAny { + core_str::StrExt::lines_any(self[]) + } + + /// Returns the number of Unicode code points (`char`) that a + /// string holds. + /// + /// This does not perform any normalization, and is `O(n)`, since + /// UTF-8 is a variable width encoding of code points. + /// + /// *Warning*: The number of code points in a string does not directly + /// correspond to the number of visible characters or width of the + /// visible text due to composing characters, and double- and + /// zero-width ones. + /// + /// See also `.len()` for the byte length. + /// + /// # Example + /// + /// ```rust + /// // composed forms of `ö` and `é` + /// let c = "Löwe 老虎 Léopard"; // German, Simplified Chinese, French + /// // decomposed forms of `ö` and `é` + /// let d = "Lo\u0308we 老虎 Le\u0301opard"; + /// + /// assert_eq!(c.char_len(), 15); + /// assert_eq!(d.char_len(), 17); + /// + /// assert_eq!(c.len(), 21); + /// assert_eq!(d.len(), 23); + /// + /// // the two strings *look* the same + /// println!("{}", c); + /// println!("{}", d); + /// ``` + #[deprecated = "call .chars().count() instead"] + fn char_len(&self) -> uint { + core_str::StrExt::char_len(self[]) + } + + /// Returns a slice of the given string from the byte range + /// [`begin`..`end`). + /// + /// This operation is `O(1)`. + /// + /// Panics when `begin` and `end` do not point to valid characters + /// or point beyond the last character of the string. + /// + /// See also `slice_to` and `slice_from` for slicing prefixes and + /// suffixes of strings, and `slice_chars` for slicing based on + /// code point counts. 
+ /// + /// # Example + /// + /// ```rust + /// let s = "Löwe 老虎 Léopard"; + /// assert_eq!(s.slice(0, 1), "L"); + /// + /// assert_eq!(s.slice(1, 9), "öwe 老"); + /// + /// // these will panic: + /// // byte 2 lies within `ö`: + /// // s.slice(2, 3); + /// + /// // byte 8 lies within `老` + /// // s.slice(1, 8); + /// + /// // byte 100 is outside the string + /// // s.slice(3, 100); + /// ``` + #[unstable = "use slice notation [a..b] instead"] + fn slice(&self, begin: uint, end: uint) -> &str { + core_str::StrExt::slice(self[], begin, end) + } + + /// Returns a slice of the string from `begin` to its end. + /// + /// Equivalent to `self.slice(begin, self.len())`. + /// + /// Panics when `begin` does not point to a valid character, or is + /// out of bounds. + /// + /// See also `slice`, `slice_to` and `slice_chars`. + #[unstable = "use slice notation [a..] instead"] + fn slice_from(&self, begin: uint) -> &str { + core_str::StrExt::slice_from(self[], begin) + } + + /// Returns a slice of the string from the beginning to byte + /// `end`. + /// + /// Equivalent to `self.slice(0, end)`. + /// + /// Panics when `end` does not point to a valid character, or is + /// out of bounds. + /// + /// See also `slice`, `slice_from` and `slice_chars`. + #[unstable = "use slice notation [0..a] instead"] + fn slice_to(&self, end: uint) -> &str { + core_str::StrExt::slice_to(self[], end) + } + + /// Returns a slice of the string from the character range + /// [`begin`..`end`). + /// + /// That is, start at the `begin`-th code point of the string and + /// continue to the `end`-th code point. This does not detect or + /// handle edge cases such as leaving a combining character as the + /// first code point of the string. + /// + /// Due to the design of UTF-8, this operation is `O(end)`. + /// See `slice`, `slice_to` and `slice_from` for `O(1)` + /// variants that use byte indices rather than code point + /// indices. 
+ /// + /// Panics if `begin` > `end` or if either `begin` or `end` is + /// beyond the last character of the string. + /// + /// # Example + /// + /// ```rust + /// let s = "Löwe 老虎 Léopard"; + /// assert_eq!(s.slice_chars(0, 4), "Löwe"); + /// assert_eq!(s.slice_chars(5, 7), "老虎"); + /// ``` + #[unstable = "may have yet to prove its worth"] + fn slice_chars(&self, begin: uint, end: uint) -> &str { + core_str::StrExt::slice_chars(self[], begin, end) + } + + /// Takes a bytewise (not UTF-8) slice from a string. + /// + /// Returns the substring from [`begin`..`end`). + /// + /// Caller must check both UTF-8 character boundaries and the boundaries of + /// the entire slice as well. + #[stable] + unsafe fn slice_unchecked(&self, begin: uint, end: uint) -> &str { + core_str::StrExt::slice_unchecked(self[], begin, end) + } + + /// Returns true if `needle` is a prefix of the string. + /// + /// # Example + /// + /// ```rust + /// assert!("banana".starts_with("ba")); + /// ``` + #[unstable = "awaiting pattern/matcher stabilization"] + fn starts_with(&self, needle: &str) -> bool { + core_str::StrExt::starts_with(self[], needle) + } + + /// Returns true if `needle` is a suffix of the string. + /// + /// # Example + /// + /// ```rust + /// assert!("banana".ends_with("nana")); + /// ``` + #[unstable = "awaiting pattern/matcher stabilization"] + fn ends_with(&self, needle: &str) -> bool { + core_str::StrExt::ends_with(self[], needle) + } + + /// Returns a string with characters that match `to_trim` removed from the left and the right. 
+ /// + /// # Arguments + /// + /// * to_trim - a character matcher + /// + /// # Example + /// + /// ```rust + /// assert_eq!("11foo1bar11".trim_chars('1'), "foo1bar") + /// let x: &[_] = &['1', '2']; + /// assert_eq!("12foo1bar12".trim_chars(x), "foo1bar") + /// assert_eq!("123foo1bar123".trim_chars(|c: char| c.is_numeric()), "foo1bar") + /// ``` + #[unstable = "awaiting pattern/matcher stabilization"] + fn trim_chars(&self, to_trim: C) -> &str { + core_str::StrExt::trim_chars(self[], to_trim) + } + + /// Returns a string with leading `chars_to_trim` removed. + /// + /// # Arguments + /// + /// * to_trim - a character matcher + /// + /// # Example + /// + /// ```rust + /// assert_eq!("11foo1bar11".trim_left_chars('1'), "foo1bar11") + /// let x: &[_] = &['1', '2']; + /// assert_eq!("12foo1bar12".trim_left_chars(x), "foo1bar12") + /// assert_eq!("123foo1bar123".trim_left_chars(|c: char| c.is_numeric()), "foo1bar123") + /// ``` + #[unstable = "awaiting pattern/matcher stabilization"] + fn trim_left_chars(&self, to_trim: C) -> &str { + core_str::StrExt::trim_left_chars(self[], to_trim) + } + + /// Returns a string with trailing `chars_to_trim` removed. + /// + /// # Arguments + /// + /// * to_trim - a character matcher + /// + /// # Example + /// + /// ```rust + /// assert_eq!("11foo1bar11".trim_right_chars('1'), "11foo1bar") + /// let x: &[_] = &['1', '2']; + /// assert_eq!("12foo1bar12".trim_right_chars(x), "12foo1bar") + /// assert_eq!("123foo1bar123".trim_right_chars(|c: char| c.is_numeric()), "123foo1bar") + /// ``` + #[unstable = "awaiting pattern/matcher stabilization"] + fn trim_right_chars(&self, to_trim: C) -> &str { + core_str::StrExt::trim_right_chars(self[], to_trim) + } + + /// Check that `index`-th byte lies at the start and/or end of a + /// UTF-8 code point sequence. + /// + /// The start and end of the string (when `index == self.len()`) + /// are considered to be boundaries. + /// + /// Panics if `index` is greater than `self.len()`. 
+ /// + /// # Example + /// + /// ```rust + /// let s = "Löwe 老虎 Léopard"; + /// assert!(s.is_char_boundary(0)); + /// // start of `老` + /// assert!(s.is_char_boundary(6)); + /// assert!(s.is_char_boundary(s.len())); + /// + /// // second byte of `ö` + /// assert!(!s.is_char_boundary(2)); + /// + /// // third byte of `老` + /// assert!(!s.is_char_boundary(8)); + /// ``` + #[unstable = "naming is uncertain with container conventions"] + fn is_char_boundary(&self, index: uint) -> bool { + core_str::StrExt::is_char_boundary(self[], index) + } + + /// Pluck a character out of a string and return the index of the next + /// character. + /// + /// This function can be used to iterate over the Unicode characters of a + /// string. + /// + /// # Example + /// + /// This example manually iterates through the characters of a + /// string; this should normally be done by `.chars()` or + /// `.char_indices`. + /// + /// ```rust + /// use std::str::CharRange; + /// + /// let s = "中华Việt Nam"; + /// let mut i = 0u; + /// while i < s.len() { + /// let CharRange {ch, next} = s.char_range_at(i); + /// println!("{}: {}", i, ch); + /// i = next; + /// } + /// ``` + /// + /// This outputs: + /// + /// ```text + /// 0: 中 + /// 3: 华 + /// 6: V + /// 7: i + /// 8: ệ + /// 11: t + /// 12: + /// 13: N + /// 14: a + /// 15: m + /// ``` + /// + /// # Arguments + /// + /// * s - The string + /// * i - The byte offset of the char to extract + /// + /// # Return value + /// + /// A record {ch: char, next: uint} containing the char value and the byte + /// index of the next Unicode character. + /// + /// # Panics + /// + /// If `i` is greater than or equal to the length of the string. + /// If `i` is not the index of the beginning of a valid UTF-8 character. 
+ #[unstable = "naming is uncertain with container conventions"] + fn char_range_at(&self, start: uint) -> CharRange { + core_str::StrExt::char_range_at(self[], start) + } + + /// Given a byte position and a str, return the previous char and its position. + /// + /// This function can be used to iterate over a Unicode string in reverse. + /// + /// Returns 0 for next index if called on start index 0. + /// + /// # Panics + /// + /// If `i` is greater than the length of the string. + /// If `i` is not an index following a valid UTF-8 character. + #[unstable = "naming is uncertain with container conventions"] + fn char_range_at_reverse(&self, start: uint) -> CharRange { + core_str::StrExt::char_range_at_reverse(self[], start) + } + + /// Plucks the character starting at the `i`th byte of a string. + /// + /// # Example + /// + /// ```rust + /// let s = "abπc"; + /// assert_eq!(s.char_at(1), 'b'); + /// assert_eq!(s.char_at(2), 'π'); + /// assert_eq!(s.char_at(4), 'c'); + /// ``` + /// + /// # Panics + /// + /// If `i` is greater than or equal to the length of the string. + /// If `i` is not the index of the beginning of a valid UTF-8 character. + #[unstable = "naming is uncertain with container conventions"] + fn char_at(&self, i: uint) -> char { + core_str::StrExt::char_at(self[], i) + } + + /// Plucks the character ending at the `i`th byte of a string. + /// + /// # Panics + /// + /// If `i` is greater than the length of the string. + /// If `i` is not an index following a valid UTF-8 character. + #[unstable = "naming is uncertain with container conventions"] + fn char_at_reverse(&self, i: uint) -> char { + core_str::StrExt::char_at_reverse(self[], i) + } + + /// Work with the byte buffer of a string as a byte slice. 
+ /// + /// # Example + /// + /// ```rust + /// assert_eq!("bors".as_bytes(), b"bors"); + /// ``` + #[stable] + fn as_bytes(&self) -> &[u8] { + core_str::StrExt::as_bytes(self[]) + } + + /// Returns the byte index of the first character of `self` that + /// matches `search`. + /// + /// # Return value + /// + /// `Some` containing the byte index of the first matching character + /// or `None` if there is no match. + /// + /// # Example + /// + /// ```rust + /// let s = "Löwe 老虎 Léopard"; + /// + /// assert_eq!(s.find('L'), Some(0)); + /// assert_eq!(s.find('é'), Some(14)); + /// + /// // the first space + /// assert_eq!(s.find(|c: char| c.is_whitespace()), Some(5)); + /// + /// // neither are found + /// let x: &[_] = &['1', '2']; + /// assert_eq!(s.find(x), None); + /// ``` + #[unstable = "awaiting pattern/matcher stabilization"] + fn find(&self, search: C) -> Option { + core_str::StrExt::find(self[], search) + } + + /// Returns the byte index of the last character of `self` that + /// matches `search`. + /// + /// # Return value + /// + /// `Some` containing the byte index of the last matching character + /// or `None` if there is no match. 
+ /// + /// # Example + /// + /// ```rust + /// let s = "Löwe 老虎 Léopard"; + /// + /// assert_eq!(s.rfind('L'), Some(13)); + /// assert_eq!(s.rfind('é'), Some(14)); + /// + /// // the second space + /// assert_eq!(s.rfind(|c: char| c.is_whitespace()), Some(12)); + /// + /// // searches for an occurrence of either `1` or `2`, but neither are found + /// let x: &[_] = &['1', '2']; + /// assert_eq!(s.rfind(x), None); + /// ``` + #[unstable = "awaiting pattern/matcher stabilization"] + fn rfind(&self, search: C) -> Option { + core_str::StrExt::rfind(self[], search) + } + + /// Returns the byte index of the first matching substring + /// + /// # Arguments + /// + /// * `needle` - The string to search for + /// + /// # Return value + /// + /// `Some` containing the byte index of the first matching substring + /// or `None` if there is no match. + /// + /// # Example + /// + /// ```rust + /// let s = "Löwe 老虎 Léopard"; + /// + /// assert_eq!(s.find_str("老虎 L"), Some(6)); + /// assert_eq!(s.find_str("muffin man"), None); + /// ``` + #[unstable = "awaiting pattern/matcher stabilization"] + fn find_str(&self, needle: &str) -> Option { + core_str::StrExt::find_str(self[], needle) + } + + /// Retrieves the first character from a string slice and returns + /// it. This does not allocate a new string; instead, it returns a + /// slice that point one character beyond the character that was + /// shifted. If the string does not contain any characters, + /// None is returned instead. 
+ /// + /// # Example + /// + /// ```rust + /// let s = "Löwe 老虎 Léopard"; + /// let (c, s1) = s.slice_shift_char().unwrap(); + /// assert_eq!(c, 'L'); + /// assert_eq!(s1, "öwe 老虎 Léopard"); + /// + /// let (c, s2) = s1.slice_shift_char().unwrap(); + /// assert_eq!(c, 'ö'); + /// assert_eq!(s2, "we 老虎 Léopard"); + /// ``` + #[unstable = "awaiting conventions about shifting and slices"] + fn slice_shift_char(&self) -> Option<(char, &str)> { + core_str::StrExt::slice_shift_char(self[]) + } + + /// Returns the byte offset of an inner slice relative to an enclosing outer slice. + /// + /// Panics if `inner` is not a direct slice contained within self. + /// + /// # Example + /// + /// ```rust + /// let string = "a\nb\nc"; + /// let lines: Vec<&str> = string.lines().collect(); + /// + /// assert!(string.subslice_offset(lines[0]) == 0); // &"a" + /// assert!(string.subslice_offset(lines[1]) == 2); // &"b" + /// assert!(string.subslice_offset(lines[2]) == 4); // &"c" + /// ``` + #[unstable = "awaiting pattern/matcher stabilization"] + fn subslice_offset(&self, inner: &str) -> uint { + core_str::StrExt::subslice_offset(self[], inner) + } + + /// Return an unsafe pointer to the string's buffer. + /// + /// The caller must ensure that the string outlives this pointer, + /// and that it is not reallocated (e.g. by pushing to the + /// string). + #[stable] #[inline] - fn into_string(self) -> String { - String::from_str(self) + fn as_ptr(&self) -> *const u8 { + core_str::StrExt::as_ptr(self[]) + } + + /// Return an iterator of `u16` over the string encoded as UTF-16. 
+ #[unstable = "this functionality may only be provided by libunicode"] + fn utf16_units(&self) -> Utf16Units { + Utf16Units { encoder: Utf16Encoder::new(self[].chars()) } + } + + /// Return the number of bytes in this string + /// + /// # Example + /// + /// ``` + /// assert_eq!("foo".len(), 3); + /// assert_eq!("ƒoo".len(), 4); + /// ``` + #[stable] + #[inline] + fn len(&self) -> uint { + core_str::StrExt::len(self[]) + } + + /// Returns true if this slice contains no bytes + /// + /// # Example + /// + /// ``` + /// assert!("".is_empty()); + /// ``` + #[inline] + #[stable] + fn is_empty(&self) -> bool { + core_str::StrExt::is_empty(self[]) + } + + /// Parse this string into the specified type. + /// + /// # Example + /// + /// ``` + /// assert_eq!("4".parse::(), Some(4)); + /// assert_eq!("j".parse::(), None); + /// ``` + #[inline] + #[unstable = "this method was just created"] + fn parse(&self) -> Option { + FromStr::from_str(self[]) + } + + /// Returns an iterator over the + /// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries) + /// of the string. + /// + /// If `is_extended` is true, the iterator is over the *extended grapheme clusters*; + /// otherwise, the iterator is over the *legacy grapheme clusters*. + /// [UAX#29](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries) + /// recommends extended grapheme cluster boundaries for general processing. 
+ /// + /// # Example + /// + /// ```rust + /// let gr1 = "a\u0310e\u0301o\u0308\u0332".graphemes(true).collect::>(); + /// let b: &[_] = &["a\u0310", "e\u0301", "o\u0308\u0332"]; + /// assert_eq!(gr1.as_slice(), b); + /// let gr2 = "a\r\nb🇷🇺🇸🇹".graphemes(true).collect::>(); + /// let b: &[_] = &["a", "\r\n", "b", "🇷🇺🇸🇹"]; + /// assert_eq!(gr2.as_slice(), b); + /// ``` + #[unstable = "this functionality may only be provided by libunicode"] + fn graphemes(&self, is_extended: bool) -> Graphemes { + UnicodeStr::graphemes(self[], is_extended) + } + + /// Returns an iterator over the grapheme clusters of self and their byte offsets. + /// See `graphemes()` method for more information. + /// + /// # Example + /// + /// ```rust + /// let gr_inds = "a̐éö̲\r\n".grapheme_indices(true).collect::>(); + /// let b: &[_] = &[(0u, "a̐"), (3, "é"), (6, "ö̲"), (11, "\r\n")]; + /// assert_eq!(gr_inds.as_slice(), b); + /// ``` + #[unstable = "this functionality may only be provided by libunicode"] + fn grapheme_indices(&self, is_extended: bool) -> GraphemeIndices { + UnicodeStr::grapheme_indices(self[], is_extended) + } + + /// An iterator over the words of a string (subsequences separated + /// by any sequence of whitespace). Sequences of whitespace are + /// collapsed, so empty "words" are not included. + /// + /// # Example + /// + /// ```rust + /// let some_words = " Mary had\ta little \n\t lamb"; + /// let v: Vec<&str> = some_words.words().collect(); + /// assert_eq!(v, vec!["Mary", "had", "a", "little", "lamb"]); + /// ``` + #[stable] + fn words(&self) -> Words { + UnicodeStr::words(self[]) + } + + /// Returns true if the string contains only whitespace. + /// + /// Whitespace characters are determined by `char::is_whitespace`. 
+ /// + /// # Example + /// + /// ```rust + /// assert!(" \t\n".is_whitespace()); + /// assert!("".is_whitespace()); + /// + /// assert!( !"abc".is_whitespace()); + /// ``` + #[deprecated = "use .chars().all(|c| c.is_whitespace())"] + fn is_whitespace(&self) -> bool { + UnicodeStr::is_whitespace(self[]) + } + + /// Returns true if the string contains only alphanumeric code + /// points. + /// + /// Alphanumeric characters are determined by `char::is_alphanumeric`. + /// + /// # Example + /// + /// ```rust + /// assert!("Löwe老虎Léopard123".is_alphanumeric()); + /// assert!("".is_alphanumeric()); + /// + /// assert!( !" &*~".is_alphanumeric()); + /// ``` + #[deprecated = "use .chars().all(|c| c.is_alphanumeric())"] + fn is_alphanumeric(&self) -> bool { + UnicodeStr::is_alphanumeric(self[]) + } + + /// Returns a string's displayed width in columns, treating control + /// characters as zero-width. + /// + /// `is_cjk` determines behavior for characters in the Ambiguous category: + /// if `is_cjk` is `true`, these are 2 columns wide; otherwise, they are 1. + /// In CJK locales, `is_cjk` should be `true`, else it should be `false`. + /// [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/) + /// recommends that these characters be treated as 1 column (i.e., + /// `is_cjk` = `false`) if the locale is unknown. + #[unstable = "this functionality may only be provided by libunicode"] + fn width(&self, is_cjk: bool) -> uint { + UnicodeStr::width(self[], is_cjk) + } + + /// Returns a string with leading and trailing whitespace removed. + #[stable] + fn trim(&self) -> &str { + UnicodeStr::trim(self[]) + } + + /// Returns a string with leading whitespace removed. + #[stable] + fn trim_left(&self) -> &str { + UnicodeStr::trim_left(self[]) + } + + /// Returns a string with trailing whitespace removed. 
+ #[stable] + fn trim_right(&self) -> &str { + UnicodeStr::trim_right(self[]) } } +impl StrExt for str {} + #[cfg(test)] mod tests { use prelude::*; @@ -1541,28 +2428,6 @@ mod tests { assert!(!"".contains_char('a')); } - #[test] - fn test_truncate_utf16_at_nul() { - let v = []; - let b: &[u16] = &[]; - assert_eq!(truncate_utf16_at_nul(&v), b); - - let v = [0, 2, 3]; - assert_eq!(truncate_utf16_at_nul(&v), b); - - let v = [1, 0, 3]; - let b: &[u16] = &[1]; - assert_eq!(truncate_utf16_at_nul(&v), b); - - let v = [1, 2, 0]; - let b: &[u16] = &[1, 2]; - assert_eq!(truncate_utf16_at_nul(&v), b); - - let v = [1, 2, 3]; - let b: &[u16] = &[1, 2, 3]; - assert_eq!(truncate_utf16_at_nul(&v), b); - } - #[test] fn test_char_at() { let s = "ศไทย中华Việt Nam"; @@ -1814,27 +2679,6 @@ mod tests { assert_eq!(words, vec!["Märy", "häd", "ä", "little", "lämb", "Little", "lämb"]) } - #[test] - fn test_lev_distance() { - use core::char::{ from_u32, MAX }; - // Test bytelength agnosticity - for c in range(0u32, MAX as u32) - .filter_map(|i| from_u32(i)) - .map(|i| String::from_char(1, i)) { - assert_eq!(c[].lev_distance(c[]), 0); - } - - let a = "\nMäry häd ä little lämb\n\nLittle lämb\n"; - let b = "\nMary häd ä little lämb\n\nLittle lämb\n"; - let c = "Mary häd ä little lämb\n\nLittle lämb\n"; - assert_eq!(a.lev_distance(b), 1); - assert_eq!(b.lev_distance(a), 1); - assert_eq!(a.lev_distance(c), 2); - assert_eq!(c.lev_distance(a), 2); - assert_eq!(b.lev_distance(c), 1); - assert_eq!(c.lev_distance(b), 1); - } - #[test] fn test_nfd_chars() { macro_rules! 
t { diff --git a/src/libcollections/string.rs b/src/libcollections/string.rs index db59424cedd..0e2b514d92d 100644 --- a/src/libcollections/string.rs +++ b/src/libcollections/string.rs @@ -21,13 +21,12 @@ use core::hash; use core::mem; use core::ptr; use core::ops; -// FIXME: ICE's abound if you import the `Slice` type while importing `Slice` trait use core::raw::Slice as RawSlice; +use unicode::str as unicode_str; +use unicode::str::Utf16Item; use slice::CloneSliceExt; -use str; -use str::{CharRange, CowString, FromStr, StrAllocating}; -use str::MaybeOwned::Owned; +use str::{mod, CharRange, FromStr, StrExt, Owned, Utf8Error}; use vec::{DerefVec, Vec, as_vec}; /// A growable string stored as a UTF-8 encoded buffer. @@ -87,8 +86,10 @@ impl String { /// Returns the vector as a string buffer, if possible, taking care not to /// copy it. /// - /// Returns `Err` with the original vector if the vector contains invalid - /// UTF-8. + /// # Failure + /// + /// If the given vector is not valid UTF-8, then the original vector and the + /// corresponding error is returned. /// /// # Examples /// @@ -103,11 +104,10 @@ impl String { /// ``` #[inline] #[unstable = "error type may change"] - pub fn from_utf8(vec: Vec) -> Result> { - if str::is_utf8(vec.as_slice()) { - Ok(String { vec: vec }) - } else { - Err(vec) + pub fn from_utf8(vec: Vec) -> Result, Utf8Error)> { + match str::from_utf8(vec.as_slice()) { + Ok(..) => Ok(String { vec: vec }), + Err(e) => Err((vec, e)) } } @@ -123,8 +123,9 @@ impl String { /// ``` #[unstable = "return type may change"] pub fn from_utf8_lossy<'a>(v: &'a [u8]) -> CowString<'a> { - if str::is_utf8(v) { - return Cow::Borrowed(unsafe { mem::transmute(v) }) + match str::from_utf8(v) { + Ok(s) => return Cow::Borrowed(s), + Err(..) 
=> {} } static TAG_CONT_U8: u8 = 128u8; @@ -173,7 +174,7 @@ impl String { if byte < 128u8 { // subseqidx handles this } else { - let w = str::utf8_char_width(byte); + let w = unicode_str::utf8_char_width(byte); match w { 2 => { @@ -235,7 +236,7 @@ impl String { res.as_mut_vec().push_all(v[subseqidx..total]) }; } - Cow::Owned(res.into_string()) + Cow::Owned(res) } /// Decode a UTF-16 encoded vector `v` into a `String`, returning `None` @@ -256,10 +257,10 @@ impl String { #[unstable = "error value in return may change"] pub fn from_utf16(v: &[u16]) -> Option { let mut s = String::with_capacity(v.len()); - for c in str::utf16_items(v) { + for c in unicode_str::utf16_items(v) { match c { - str::ScalarValue(c) => s.push(c), - str::LoneSurrogate(_) => return None + Utf16Item::ScalarValue(c) => s.push(c), + Utf16Item::LoneSurrogate(_) => return None } } Some(s) @@ -281,7 +282,7 @@ impl String { /// ``` #[stable] pub fn from_utf16_lossy(v: &[u16]) -> String { - str::utf16_items(v).map(|c| c.to_char_lossy()).collect() + unicode_str::utf16_items(v).map(|c| c.to_char_lossy()).collect() } /// Convert a vector of `char`s to a `String`. 
@@ -812,21 +813,12 @@ impl<'a, 'b> PartialEq> for &'b str { } #[experimental = "waiting on Str stabilization"] +#[allow(deprecated)] impl Str for String { #[inline] #[stable] fn as_slice<'a>(&'a self) -> &'a str { - unsafe { - mem::transmute(self.vec.as_slice()) - } - } -} - -#[experimental = "waiting on StrAllocating stabilization"] -impl StrAllocating for String { - #[inline] - fn into_string(self) -> String { - self + unsafe { mem::transmute(self.vec.as_slice()) } } } @@ -841,7 +833,7 @@ impl Default for String { #[experimental = "waiting on Show stabilization"] impl fmt::Show for String { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - self.as_slice().fmt(f) + (*self).fmt(f) } } @@ -849,7 +841,7 @@ impl fmt::Show for String { impl hash::Hash for String { #[inline] fn hash(&self, hasher: &mut H) { - self.as_slice().hash(hasher) + (*self).hash(hasher) } } @@ -873,7 +865,7 @@ impl<'a> Add<&'a str, String> for String { impl ops::Slice for String { #[inline] fn as_slice_<'a>(&'a self) -> &'a str { - self.as_slice() + unsafe { mem::transmute(self.vec.as_slice()) } } #[inline] @@ -894,7 +886,9 @@ impl ops::Slice for String { #[experimental = "waiting on Deref stabilization"] impl ops::Deref for String { - fn deref<'a>(&'a self) -> &'a str { self.as_slice() } + fn deref<'a>(&'a self) -> &'a str { + unsafe { mem::transmute(self.vec[]) } + } } /// Wrapper type providing a `&String` reference via `Deref`. 
@@ -1015,6 +1009,18 @@ pub mod raw { } } +/// A clone-on-write string +#[stable] +pub type CowString<'a> = Cow<'a, String, str>; + +#[allow(deprecated)] +impl<'a> Str for CowString<'a> { + #[inline] + fn as_slice<'b>(&'b self) -> &'b str { + (**self).as_slice() + } +} + #[cfg(test)] mod tests { use prelude::*; diff --git a/src/libcore/fmt/float.rs b/src/libcore/fmt/float.rs index d849bfa24c1..9ab450efd22 100644 --- a/src/libcore/fmt/float.rs +++ b/src/libcore/fmt/float.rs @@ -23,7 +23,7 @@ use num::cast; use ops::FnOnce; use result::Result::Ok; use slice::{mod, SliceExt}; -use str::StrPrelude; +use str::StrExt; /// A flag that specifies whether to use exponential (scientific) notation. pub enum ExponentFormat { diff --git a/src/libcore/fmt/mod.rs b/src/libcore/fmt/mod.rs index 79fb11f3854..29815e2fc85 100644 --- a/src/libcore/fmt/mod.rs +++ b/src/libcore/fmt/mod.rs @@ -24,7 +24,7 @@ use result::Result::{Ok, Err}; use result; use slice::SliceExt; use slice; -use str::StrPrelude; +use str::{StrExt, Utf8Error}; pub use self::num::radix; pub use self::num::Radix; @@ -795,5 +795,18 @@ impl<'b, T: Show> Show for RefMut<'b, T> { } } +impl Show for Utf8Error { + fn fmt(&self, f: &mut Formatter) -> Result { + match *self { + Utf8Error::InvalidByte(n) => { + write!(f, "invalid utf-8: invalid byte at index {}", n) + } + Utf8Error::TooShort => { + write!(f, "invalid utf-8: byte slice too short") + } + } + } +} + // If you expected tests to be here, look instead at the run-pass/ifmt.rs test, // it's a lot easier than creating all of the rt::Piece structures here. 
diff --git a/src/libcore/num/mod.rs b/src/libcore/num/mod.rs index 84d1d8e459a..60735879213 100644 --- a/src/libcore/num/mod.rs +++ b/src/libcore/num/mod.rs @@ -32,7 +32,7 @@ use ops::{Add, Sub, Mul, Div, Rem, Neg}; use ops::{Not, BitAnd, BitOr, BitXor, Shl, Shr}; use option::Option; use option::Option::{Some, None}; -use str::{FromStr, from_str, StrPrelude}; +use str::{FromStr, from_str, StrExt}; /// Simultaneous division and remainder #[inline] diff --git a/src/libcore/prelude.rs b/src/libcore/prelude.rs index ff3fc870beb..f6abc8da79c 100644 --- a/src/libcore/prelude.rs +++ b/src/libcore/prelude.rs @@ -60,7 +60,7 @@ pub use option::Option::{Some, None}; pub use ptr::RawPtr; pub use result::Result; pub use result::Result::{Ok, Err}; -pub use str::{Str, StrPrelude}; +pub use str::{Str, StrExt}; pub use tuple::{Tuple1, Tuple2, Tuple3, Tuple4}; pub use tuple::{Tuple5, Tuple6, Tuple7, Tuple8}; pub use tuple::{Tuple9, Tuple10, Tuple11, Tuple12}; diff --git a/src/libcore/str.rs b/src/libcore/str.rs index a89a7970ae9..60d4262a9b1 100644 --- a/src/libcore/str.rs +++ b/src/libcore/str.rs @@ -16,31 +16,30 @@ #![doc(primitive = "str")] -pub use self::Utf16Item::*; -pub use self::Searcher::{Naive, TwoWay, TwoWayLong}; +use self::Searcher::{Naive, TwoWay, TwoWayLong}; -use char::Char; -use char; +use char::{mod, Char}; use clone::Clone; -use cmp::{Eq, mod}; +use cmp::{mod, Eq}; use default::Default; -use iter::{Map, Iterator, IteratorExt, DoubleEndedIterator}; -use iter::{DoubleEndedIteratorExt, ExactSizeIterator}; use iter::range; -use kinds::Sized; +use iter::{DoubleEndedIteratorExt, ExactSizeIterator}; +use iter::{Map, Iterator, IteratorExt, DoubleEndedIterator}; +use kinds::{Copy, Sized}; use mem; use num::Int; -use option::Option; -use option::Option::{None, Some}; use ops::{Fn, FnMut}; +use option::Option::{mod, None, Some}; use ptr::RawPtr; use raw::{Repr, Slice}; +use result::Result::{mod, Ok, Err}; use slice::{mod, SliceExt}; use uint; /// A trait to abstract the idea 
of creating a new instance of a type from a /// string. -#[experimental = "might need to return Result"] +// FIXME(#17307): there should be an `E` associated type for a `Result` return +#[unstable = "will return a Result once associated types are working"] pub trait FromStr { /// Parses a string `s` to return an optional value of this type. If the /// string is ill-formatted, the None is returned. @@ -48,6 +47,7 @@ pub trait FromStr { } /// A utility function that just calls FromStr::from_str +#[deprecated = "call the .parse() method on the string instead"] pub fn from_str(s: &str) -> Option { FromStr::from_str(s) } @@ -78,22 +78,38 @@ impl FromStr for bool { Section: Creating a string */ -/// Converts a slice of bytes to a string slice without performing any allocations. +/// Errors which can occur when attempting to interpret a byte slice as a `str`. +pub enum Utf8Error { + /// An invalid byte was detected at the byte offset given. + /// + /// The offset is guaranteed to be in bounds of the slice in question, and + /// the byte at the specified offset was the first invalid byte in the + /// sequence detected. + InvalidByte(uint), + + /// The byte slice was invalid because more bytes were needed but no more + /// bytes were available. + TooShort, +} + +/// Converts a slice of bytes to a string slice without performing any +/// allocations. /// /// Once the slice has been validated as utf-8, it is transmuted in-place and /// returned as a '&str' instead of a '&[u8]' /// -/// Returns None if the slice is not utf-8. -pub fn from_utf8<'a>(v: &'a [u8]) -> Option<&'a str> { - if is_utf8(v) { - Some(unsafe { from_utf8_unchecked(v) }) - } else { - None - } +/// # Failure +/// +/// Returns `Err` if the slice is not utf-8 with a description as to why the +/// provided slice is not utf-8. 
+pub fn from_utf8(v: &[u8]) -> Result<&str, Utf8Error> { + try!(run_utf8_validation_iterator(&mut v.iter())); + Ok(unsafe { from_utf8_unchecked(v) }) } /// Converts a slice of bytes to a string slice without checking /// that the string contains valid UTF-8. +#[stable] pub unsafe fn from_utf8_unchecked<'a>(v: &'a [u8]) -> &'a str { mem::transmute(v) } @@ -111,6 +127,7 @@ pub unsafe fn from_utf8_unchecked<'a>(v: &'a [u8]) -> &'a str { /// # Panics /// /// This function will panic if the string pointed to by `s` is not valid UTF-8. +#[unstable = "may change location based on the outcome of the c_str module"] pub unsafe fn from_c_str(s: *const i8) -> &'static str { let s = s as *const u8; let mut len = 0u; @@ -118,10 +135,11 @@ pub unsafe fn from_c_str(s: *const i8) -> &'static str { len += 1u; } let v: &'static [u8] = ::mem::transmute(Slice { data: s, len: len }); - from_utf8(v).expect("from_c_str passed invalid utf-8 data") + from_utf8(v).ok().expect("from_c_str passed invalid utf-8 data") } /// Something that can be used to compare against a character +#[unstable = "definition may change as pattern-related methods are stabilized"] pub trait CharEq { /// Determine if the splitter should split at the given character fn matches(&mut self, char) -> bool; @@ -273,12 +291,12 @@ impl<'a> DoubleEndedIterator for Chars<'a> { /// External iterator for a string's characters and their byte offsets. /// Use with the `std::iter` module. 
#[deriving(Clone)] -pub struct CharOffsets<'a> { +pub struct CharIndices<'a> { front_offset: uint, iter: Chars<'a>, } -impl<'a> Iterator<(uint, char)> for CharOffsets<'a> { +impl<'a> Iterator<(uint, char)> for CharIndices<'a> { #[inline] fn next(&mut self) -> Option<(uint, char)> { let (pre_len, _) = self.iter.iter.size_hint(); @@ -299,7 +317,7 @@ impl<'a> Iterator<(uint, char)> for CharOffsets<'a> { } } -impl<'a> DoubleEndedIterator<(uint, char)> for CharOffsets<'a> { +impl<'a> DoubleEndedIterator<(uint, char)> for CharIndices<'a> { #[inline] fn next_back(&mut self) -> Option<(uint, char)> { match self.iter.next_back() { @@ -315,13 +333,15 @@ impl<'a> DoubleEndedIterator<(uint, char)> for CharOffsets<'a> { /// External iterator for a string's bytes. /// Use with the `std::iter` module. -pub type Bytes<'a> = Map<&'a u8, u8, slice::Items<'a, u8>, BytesFn>; +#[stable] +pub struct Bytes<'a> { + inner: Map<&'a u8, u8, slice::Items<'a, u8>, BytesFn>, +} /// A temporary new type wrapper that ensures that the `Bytes` iterator /// is cloneable. #[deriving(Copy)] -#[experimental = "iterator type instability"] -pub struct BytesFn(fn(&u8) -> u8); +struct BytesFn(fn(&u8) -> u8); impl<'a> Fn(&'a u8) -> u8 for BytesFn { extern "rust-call" fn call(&self, (ptr,): (&'a u8,)) -> u8 { @@ -355,8 +375,17 @@ pub struct CharSplitsN<'a, Sep> { invert: bool, } +/// An iterator over the lines of a string, separated by `\n`. +#[stable] +pub struct Lines<'a> { + inner: CharSplits<'a, char>, +} + /// An iterator over the lines of a string, separated by either `\n` or (`\r\n`). -pub type AnyLines<'a> = Map<&'a str, &'a str, CharSplits<'a, char>, fn(&str) -> &str>; +#[stable] +pub struct LinesAny<'a> { + inner: Map<&'a str, &'a str, Lines<'a>, fn(&str) -> &str>, +} impl<'a, Sep> CharSplits<'a, Sep> { #[inline] @@ -799,63 +828,6 @@ impl<'a> Iterator<&'a str> for StrSplits<'a> { } } -/// External iterator for a string's UTF16 codeunits. -/// Use with the `std::iter` module. 
-#[deriving(Clone)] -pub struct Utf16CodeUnits<'a> { - encoder: Utf16Encoder> -} - -impl<'a> Iterator for Utf16CodeUnits<'a> { - #[inline] - fn next(&mut self) -> Option { self.encoder.next() } - - #[inline] - fn size_hint(&self) -> (uint, Option) { self.encoder.size_hint() } -} - - -/// Iterator adaptor for encoding `char`s to UTF-16. -#[deriving(Clone)] -pub struct Utf16Encoder { - chars: I, - extra: u16 -} - -impl Utf16Encoder { - /// Create an UTF-16 encoder from any `char` iterator. - pub fn new(chars: I) -> Utf16Encoder where I: Iterator { - Utf16Encoder { chars: chars, extra: 0 } - } -} - -impl Iterator for Utf16Encoder where I: Iterator { - #[inline] - fn next(&mut self) -> Option { - if self.extra != 0 { - let tmp = self.extra; - self.extra = 0; - return Some(tmp); - } - - let mut buf = [0u16, ..2]; - self.chars.next().map(|ch| { - let n = ch.encode_utf16(buf[mut]).unwrap_or(0); - if n == 2 { self.extra = buf[1]; } - buf[0] - }) - } - - #[inline] - fn size_hint(&self) -> (uint, Option) { - let (low, high) = self.chars.size_hint(); - // every char gets either one u16 or two u16, - // so this iterator is between 1 or 2 times as - // long as the underlying iterator. - (low, high.and_then(|n| n.checked_mul(2))) - } -} - /* Section: Comparing strings */ @@ -880,7 +852,7 @@ fn eq_slice_(a: &str, b: &str) -> bool { /// to compare &[u8] byte slices that are not necessarily valid UTF-8. #[lang="str_eq"] #[inline] -pub fn eq_slice(a: &str, b: &str) -> bool { +fn eq_slice(a: &str, b: &str) -> bool { eq_slice_(a, b) } @@ -893,32 +865,37 @@ Section: Misc /// `iter` reset such that it is pointing at the first byte in the /// invalid sequence. #[inline(always)] -fn run_utf8_validation_iterator(iter: &mut slice::Items) -> bool { +fn run_utf8_validation_iterator(iter: &mut slice::Items) + -> Result<(), Utf8Error> { + let whole = iter.as_slice(); loop { // save the current thing we're pointing at. 
let old = *iter; // restore the iterator we had at the start of this codepoint. - macro_rules! err ( () => { {*iter = old; return false} }); + macro_rules! err (() => { { + *iter = old; + return Err(Utf8Error::InvalidByte(whole.len() - iter.as_slice().len())) + } }); macro_rules! next ( () => { - match iter.next() { - Some(a) => *a, - // we needed data, but there was none: error! - None => err!() - } - }); + match iter.next() { + Some(a) => *a, + // we needed data, but there was none: error! + None => return Err(Utf8Error::TooShort), + } + }); let first = match iter.next() { Some(&b) => b, // we're at the end of the iterator and a codepoint // boundary at the same time, so this string is valid. - None => return true + None => return Ok(()) }; // ASCII characters are always valid, so only large // bytes need more examination. if first >= 128 { - let w = utf8_char_width(first); + let w = UTF8_CHAR_WIDTH[first as uint] as uint; let second = next!(); // 2-byte encoding is for codepoints \u{0080} to \u{07ff} // first C2 80 last DF BF @@ -964,125 +941,9 @@ fn run_utf8_validation_iterator(iter: &mut slice::Items) -> bool { } /// Determines if a vector of bytes contains valid UTF-8. +#[deprecated = "call from_utf8 instead"] pub fn is_utf8(v: &[u8]) -> bool { - run_utf8_validation_iterator(&mut v.iter()) -} - -/// Determines if a vector of `u16` contains valid UTF-16 -pub fn is_utf16(v: &[u16]) -> bool { - let mut it = v.iter(); - macro_rules! next ( ($ret:expr) => { - match it.next() { Some(u) => *u, None => return $ret } - } - ); - loop { - let u = next!(true); - - match char::from_u32(u as u32) { - Some(_) => {} - None => { - let u2 = next!(false); - if u < 0xD7FF || u > 0xDBFF || - u2 < 0xDC00 || u2 > 0xDFFF { return false; } - } - } - } -} - -/// An iterator that decodes UTF-16 encoded codepoints from a vector -/// of `u16`s. -#[deriving(Clone)] -pub struct Utf16Items<'a> { - iter: slice::Items<'a, u16> -} -/// The possibilities for values decoded from a `u16` stream. 
-#[deriving(Copy, PartialEq, Eq, Clone, Show)] -pub enum Utf16Item { - /// A valid codepoint. - ScalarValue(char), - /// An invalid surrogate without its pair. - LoneSurrogate(u16) -} - -impl Utf16Item { - /// Convert `self` to a `char`, taking `LoneSurrogate`s to the - /// replacement character (U+FFFD). - #[inline] - pub fn to_char_lossy(&self) -> char { - match *self { - ScalarValue(c) => c, - LoneSurrogate(_) => '\u{FFFD}' - } - } -} - -impl<'a> Iterator for Utf16Items<'a> { - fn next(&mut self) -> Option { - let u = match self.iter.next() { - Some(u) => *u, - None => return None - }; - - if u < 0xD800 || 0xDFFF < u { - // not a surrogate - Some(ScalarValue(unsafe {mem::transmute(u as u32)})) - } else if u >= 0xDC00 { - // a trailing surrogate - Some(LoneSurrogate(u)) - } else { - // preserve state for rewinding. - let old = self.iter; - - let u2 = match self.iter.next() { - Some(u2) => *u2, - // eof - None => return Some(LoneSurrogate(u)) - }; - if u2 < 0xDC00 || u2 > 0xDFFF { - // not a trailing surrogate so we're not a valid - // surrogate pair, so rewind to redecode u2 next time. - self.iter = old; - return Some(LoneSurrogate(u)) - } - - // all ok, so lets decode it. - let c = ((u - 0xD800) as u32 << 10 | (u2 - 0xDC00) as u32) + 0x1_0000; - Some(ScalarValue(unsafe {mem::transmute(c)})) - } - } - - #[inline] - fn size_hint(&self) -> (uint, Option) { - let (low, high) = self.iter.size_hint(); - // we could be entirely valid surrogates (2 elements per - // char), or entirely non-surrogates (1 element per char) - (low / 2, high) - } -} - -/// Create an iterator over the UTF-16 encoded codepoints in `v`, -/// returning invalid surrogates as `LoneSurrogate`s. 
-/// -/// # Example -/// -/// ```rust -/// use std::str; -/// use std::str::{ScalarValue, LoneSurrogate}; -/// -/// // 𝄞music -/// let v = [0xD834, 0xDD1E, 0x006d, 0x0075, -/// 0x0073, 0xDD1E, 0x0069, 0x0063, -/// 0xD834]; -/// -/// assert_eq!(str::utf16_items(&v).collect::>(), -/// vec![ScalarValue('𝄞'), -/// ScalarValue('m'), ScalarValue('u'), ScalarValue('s'), -/// LoneSurrogate(0xDD1E), -/// ScalarValue('i'), ScalarValue('c'), -/// LoneSurrogate(0xD834)]); -/// ``` -pub fn utf16_items<'a>(v: &'a [u16]) -> Utf16Items<'a> { - Utf16Items { iter : v.iter() } + run_utf8_validation_iterator(&mut v.iter()).is_ok() } /// Return a slice of `v` ending at (and not including) the first NUL @@ -1103,6 +964,7 @@ pub fn utf16_items<'a>(v: &'a [u16]) -> Utf16Items<'a> { /// let b: &[_] = &['a' as u16, 'b' as u16]; /// assert_eq!(str::truncate_utf16_at_nul(&v), b); /// ``` +#[deprecated = "this function will be removed"] pub fn truncate_utf16_at_nul<'a>(v: &'a [u16]) -> &'a [u16] { match v.iter().position(|c| *c == 0) { // don't include the 0 @@ -1133,6 +995,7 @@ static UTF8_CHAR_WIDTH: [u8, ..256] = [ /// Given a first byte, determine how many bytes are in this UTF-8 character #[inline] +#[deprecated = "this function has moved to libunicode"] pub fn utf8_char_width(b: u8) -> uint { return UTF8_CHAR_WIDTH[b as uint] as uint; } @@ -1141,6 +1004,7 @@ pub fn utf8_char_width(b: u8) -> uint { /// the next `char` in a string. This can be used as a data structure /// for iterating over the UTF-8 bytes of a string. #[deriving(Copy)] +#[unstable = "naming is uncertain with container conventions"] pub struct CharRange { /// Current `char` pub ch: char, @@ -1159,7 +1023,7 @@ pub mod raw { use ptr::RawPtr; use raw::Slice; use slice::SliceExt; - use str::{is_utf8, StrPrelude}; + use str::StrExt; /// Converts a slice of bytes to a string slice without checking /// that the string contains valid UTF-8. 
@@ -1181,8 +1045,7 @@ pub mod raw { curr = s.offset(len as int); } let v = Slice { data: s, len: len }; - assert!(is_utf8(::mem::transmute(v))); - ::mem::transmute(v) + super::from_utf8(::mem::transmute(v)).unwrap() } /// Takes a bytewise (not UTF-8) slice from a string. @@ -1225,7 +1088,7 @@ pub mod traits { use option::Option; use option::Option::Some; use ops; - use str::{Str, StrPrelude, eq_slice}; + use str::{Str, StrExt, eq_slice}; impl Ord for str { #[inline] @@ -1291,707 +1154,70 @@ pub mod traits { } /// Any string that can be represented as a slice +#[unstable = "Instead of taking this bound generically, this trait will be \ + replaced with one of slicing syntax, deref coercions, or \ + a more generic conversion trait"] pub trait Str for Sized? { /// Work with `self` as a slice. fn as_slice<'a>(&'a self) -> &'a str; } +#[allow(deprecated)] impl Str for str { #[inline] fn as_slice<'a>(&'a self) -> &'a str { self } } +#[allow(deprecated)] impl<'a, Sized? S> Str for &'a S where S: Str { #[inline] fn as_slice(&self) -> &str { Str::as_slice(*self) } } /// Methods for string slices -pub trait StrPrelude for Sized? { - /// Returns true if one string contains another - /// - /// # Arguments - /// - /// - needle - The string to look for - /// - /// # Example - /// - /// ```rust - /// assert!("bananas".contains("nana")); - /// ``` +#[allow(missing_docs)] +pub trait StrExt for Sized? { + // NB there are no docs here as they're all located on the StrExt trait in + // libcollections, not here. + fn contains(&self, needle: &str) -> bool; - - /// Returns true if a string contains a char. - /// - /// # Arguments - /// - /// - needle - The char to look for - /// - /// # Example - /// - /// ```rust - /// assert!("hello".contains_char('e')); - /// ``` fn contains_char(&self, needle: char) -> bool; - - /// An iterator over the characters of `self`. Note, this iterates - /// over Unicode code-points, not Unicode graphemes. 
- /// - /// # Example - /// - /// ```rust - /// let v: Vec = "abc åäö".chars().collect(); - /// assert_eq!(v, vec!['a', 'b', 'c', ' ', 'å', 'ä', 'ö']); - /// ``` fn chars<'a>(&'a self) -> Chars<'a>; - - /// An iterator over the bytes of `self` - /// - /// # Example - /// - /// ```rust - /// let v: Vec = "bors".bytes().collect(); - /// assert_eq!(v, b"bors".to_vec()); - /// ``` fn bytes<'a>(&'a self) -> Bytes<'a>; - - /// An iterator over the characters of `self` and their byte offsets. - fn char_indices<'a>(&'a self) -> CharOffsets<'a>; - - /// An iterator over substrings of `self`, separated by characters - /// matched by `sep`. - /// - /// # Example - /// - /// ```rust - /// # #![feature(unboxed_closures)] - /// - /// # fn main() { - /// let v: Vec<&str> = "Mary had a little lamb".split(' ').collect(); - /// assert_eq!(v, vec!["Mary", "had", "a", "little", "lamb"]); - /// - /// let v: Vec<&str> = "abc1def2ghi".split(|&: c: char| c.is_numeric()).collect(); - /// assert_eq!(v, vec!["abc", "def", "ghi"]); - /// - /// let v: Vec<&str> = "lionXXtigerXleopard".split('X').collect(); - /// assert_eq!(v, vec!["lion", "", "tiger", "leopard"]); - /// - /// let v: Vec<&str> = "".split('X').collect(); - /// assert_eq!(v, vec![""]); - /// # } - /// ``` + fn char_indices<'a>(&'a self) -> CharIndices<'a>; fn split<'a, Sep: CharEq>(&'a self, sep: Sep) -> CharSplits<'a, Sep>; - - /// An iterator over substrings of `self`, separated by characters - /// matched by `sep`, restricted to splitting at most `count` - /// times. 
- /// - /// # Example - /// - /// ```rust - /// # #![feature(unboxed_closures)] - /// - /// # fn main() { - /// let v: Vec<&str> = "Mary had a little lambda".splitn(2, ' ').collect(); - /// assert_eq!(v, vec!["Mary", "had", "a little lambda"]); - /// - /// let v: Vec<&str> = "abc1def2ghi".splitn(1, |&: c: char| c.is_numeric()).collect(); - /// assert_eq!(v, vec!["abc", "def2ghi"]); - /// - /// let v: Vec<&str> = "lionXXtigerXleopard".splitn(2, 'X').collect(); - /// assert_eq!(v, vec!["lion", "", "tigerXleopard"]); - /// - /// let v: Vec<&str> = "abcXdef".splitn(0, 'X').collect(); - /// assert_eq!(v, vec!["abcXdef"]); - /// - /// let v: Vec<&str> = "".splitn(1, 'X').collect(); - /// assert_eq!(v, vec![""]); - /// # } - /// ``` fn splitn<'a, Sep: CharEq>(&'a self, count: uint, sep: Sep) -> CharSplitsN<'a, Sep>; - - /// An iterator over substrings of `self`, separated by characters - /// matched by `sep`. - /// - /// Equivalent to `split`, except that the trailing substring - /// is skipped if empty (terminator semantics). 
- /// - /// # Example - /// - /// ```rust - /// # #![feature(unboxed_closures)] - /// - /// # fn main() { - /// let v: Vec<&str> = "A.B.".split_terminator('.').collect(); - /// assert_eq!(v, vec!["A", "B"]); - /// - /// let v: Vec<&str> = "A..B..".split_terminator('.').collect(); - /// assert_eq!(v, vec!["A", "", "B", ""]); - /// - /// let v: Vec<&str> = "Mary had a little lamb".split(' ').rev().collect(); - /// assert_eq!(v, vec!["lamb", "little", "a", "had", "Mary"]); - /// - /// let v: Vec<&str> = "abc1def2ghi".split(|&: c: char| c.is_numeric()).rev().collect(); - /// assert_eq!(v, vec!["ghi", "def", "abc"]); - /// - /// let v: Vec<&str> = "lionXXtigerXleopard".split('X').rev().collect(); - /// assert_eq!(v, vec!["leopard", "tiger", "", "lion"]); - /// # } - /// ``` fn split_terminator<'a, Sep: CharEq>(&'a self, sep: Sep) -> CharSplits<'a, Sep>; - - /// An iterator over substrings of `self`, separated by characters - /// matched by `sep`, starting from the end of the string. - /// Restricted to splitting at most `count` times. - /// - /// # Example - /// - /// ```rust - /// # #![feature(unboxed_closures)] - /// - /// # fn main() { - /// let v: Vec<&str> = "Mary had a little lamb".rsplitn(2, ' ').collect(); - /// assert_eq!(v, vec!["lamb", "little", "Mary had a"]); - /// - /// let v: Vec<&str> = "abc1def2ghi".rsplitn(1, |&: c: char| c.is_numeric()).collect(); - /// assert_eq!(v, vec!["ghi", "abc1def"]); - /// - /// let v: Vec<&str> = "lionXXtigerXleopard".rsplitn(2, 'X').collect(); - /// assert_eq!(v, vec!["leopard", "tiger", "lionX"]); - /// # } - /// ``` fn rsplitn<'a, Sep: CharEq>(&'a self, count: uint, sep: Sep) -> CharSplitsN<'a, Sep>; - - /// An iterator over the start and end indices of the disjoint - /// matches of `sep` within `self`. - /// - /// That is, each returned value `(start, end)` satisfies - /// `self.slice(start, end) == sep`. 
For matches of `sep` within - /// `self` that overlap, only the indices corresponding to the - /// first match are returned. - /// - /// # Example - /// - /// ```rust - /// let v: Vec<(uint, uint)> = "abcXXXabcYYYabc".match_indices("abc").collect(); - /// assert_eq!(v, vec![(0,3), (6,9), (12,15)]); - /// - /// let v: Vec<(uint, uint)> = "1abcabc2".match_indices("abc").collect(); - /// assert_eq!(v, vec![(1,4), (4,7)]); - /// - /// let v: Vec<(uint, uint)> = "ababa".match_indices("aba").collect(); - /// assert_eq!(v, vec![(0, 3)]); // only the first `aba` - /// ``` fn match_indices<'a>(&'a self, sep: &'a str) -> MatchIndices<'a>; - - /// An iterator over the substrings of `self` separated by `sep`. - /// - /// # Example - /// - /// ```rust - /// let v: Vec<&str> = "abcXXXabcYYYabc".split_str("abc").collect(); - /// assert_eq!(v, vec!["", "XXX", "YYY", ""]); - /// - /// let v: Vec<&str> = "1abcabc2".split_str("abc").collect(); - /// assert_eq!(v, vec!["1", "", "2"]); - /// ``` fn split_str<'a>(&'a self, &'a str) -> StrSplits<'a>; - - /// An iterator over the lines of a string (subsequences separated - /// by `\n`). This does not include the empty string after a - /// trailing `\n`. - /// - /// # Example - /// - /// ```rust - /// let four_lines = "foo\nbar\n\nbaz\n"; - /// let v: Vec<&str> = four_lines.lines().collect(); - /// assert_eq!(v, vec!["foo", "bar", "", "baz"]); - /// ``` - fn lines<'a>(&'a self) -> CharSplits<'a, char>; - - /// An iterator over the lines of a string, separated by either - /// `\n` or `\r\n`. As with `.lines()`, this does not include an - /// empty trailing line. - /// - /// # Example - /// - /// ```rust - /// let four_lines = "foo\r\nbar\n\r\nbaz\n"; - /// let v: Vec<&str> = four_lines.lines_any().collect(); - /// assert_eq!(v, vec!["foo", "bar", "", "baz"]); - /// ``` - fn lines_any<'a>(&'a self) -> AnyLines<'a>; - - /// Returns the number of Unicode code points (`char`) that a - /// string holds. 
- /// - /// This does not perform any normalization, and is `O(n)`, since - /// UTF-8 is a variable width encoding of code points. - /// - /// *Warning*: The number of code points in a string does not directly - /// correspond to the number of visible characters or width of the - /// visible text due to composing characters, and double- and - /// zero-width ones. - /// - /// See also `.len()` for the byte length. - /// - /// # Example - /// - /// ```rust - /// // composed forms of `ö` and `é` - /// let c = "Löwe 老虎 Léopard"; // German, Simplified Chinese, French - /// // decomposed forms of `ö` and `é` - /// let d = "Lo\u{0308}we 老虎 Le\u{0301}opard"; - /// - /// assert_eq!(c.char_len(), 15); - /// assert_eq!(d.char_len(), 17); - /// - /// assert_eq!(c.len(), 21); - /// assert_eq!(d.len(), 23); - /// - /// // the two strings *look* the same - /// println!("{}", c); - /// println!("{}", d); - /// ``` + fn lines<'a>(&'a self) -> Lines<'a>; + fn lines_any<'a>(&'a self) -> LinesAny<'a>; fn char_len(&self) -> uint; - - /// Returns a slice of the given string from the byte range - /// [`begin`..`end`). - /// - /// This operation is `O(1)`. - /// - /// Panics when `begin` and `end` do not point to valid characters - /// or point beyond the last character of the string. - /// - /// See also `slice_to` and `slice_from` for slicing prefixes and - /// suffixes of strings, and `slice_chars` for slicing based on - /// code point counts. - /// - /// # Example - /// - /// ```rust - /// let s = "Löwe 老虎 Léopard"; - /// assert_eq!(s.slice(0, 1), "L"); - /// - /// assert_eq!(s.slice(1, 9), "öwe 老"); - /// - /// // these will panic: - /// // byte 2 lies within `ö`: - /// // s.slice(2, 3); - /// - /// // byte 8 lies within `老` - /// // s.slice(1, 8); - /// - /// // byte 100 is outside the string - /// // s.slice(3, 100); - /// ``` fn slice<'a>(&'a self, begin: uint, end: uint) -> &'a str; - - /// Returns a slice of the string from `begin` to its end. 
- /// - /// Equivalent to `self.slice(begin, self.len())`. - /// - /// Panics when `begin` does not point to a valid character, or is - /// out of bounds. - /// - /// See also `slice`, `slice_to` and `slice_chars`. fn slice_from<'a>(&'a self, begin: uint) -> &'a str; - - /// Returns a slice of the string from the beginning to byte - /// `end`. - /// - /// Equivalent to `self.slice(0, end)`. - /// - /// Panics when `end` does not point to a valid character, or is - /// out of bounds. - /// - /// See also `slice`, `slice_from` and `slice_chars`. fn slice_to<'a>(&'a self, end: uint) -> &'a str; - - /// Returns a slice of the string from the character range - /// [`begin`..`end`). - /// - /// That is, start at the `begin`-th code point of the string and - /// continue to the `end`-th code point. This does not detect or - /// handle edge cases such as leaving a combining character as the - /// first code point of the string. - /// - /// Due to the design of UTF-8, this operation is `O(end)`. - /// See `slice`, `slice_to` and `slice_from` for `O(1)` - /// variants that use byte indices rather than code point - /// indices. - /// - /// Panics if `begin` > `end` or the either `begin` or `end` are - /// beyond the last character of the string. - /// - /// # Example - /// - /// ```rust - /// let s = "Löwe 老虎 Léopard"; - /// assert_eq!(s.slice_chars(0, 4), "Löwe"); - /// assert_eq!(s.slice_chars(5, 7), "老虎"); - /// ``` fn slice_chars<'a>(&'a self, begin: uint, end: uint) -> &'a str; - - /// Takes a bytewise (not UTF-8) slice from a string. - /// - /// Returns the substring from [`begin`..`end`). - /// - /// Caller must check both UTF-8 character boundaries and the boundaries of - /// the entire slice as well. unsafe fn slice_unchecked<'a>(&'a self, begin: uint, end: uint) -> &'a str; - - /// Returns true if `needle` is a prefix of the string. 
- /// - /// # Example - /// - /// ```rust - /// assert!("banana".starts_with("ba")); - /// ``` fn starts_with(&self, needle: &str) -> bool; - - /// Returns true if `needle` is a suffix of the string. - /// - /// # Example - /// - /// ```rust - /// assert!("banana".ends_with("nana")); - /// ``` fn ends_with(&self, needle: &str) -> bool; - - /// Returns a string with characters that match `to_trim` removed from the left and the right. - /// - /// # Arguments - /// - /// * to_trim - a character matcher - /// - /// # Example - /// - /// ```rust - /// # #![feature(unboxed_closures)] - /// - /// # fn main() { - /// assert_eq!("11foo1bar11".trim_chars('1'), "foo1bar"); - /// let x: &[_] = &['1', '2']; - /// assert_eq!("12foo1bar12".trim_chars(x), "foo1bar"); - /// assert_eq!("123foo1bar123".trim_chars(|&: c: char| c.is_numeric()), "foo1bar"); - /// # } - /// ``` fn trim_chars<'a, C: CharEq>(&'a self, to_trim: C) -> &'a str; - - /// Returns a string with leading `chars_to_trim` removed. - /// - /// # Arguments - /// - /// * to_trim - a character matcher - /// - /// # Example - /// - /// ```rust - /// # #![feature(unboxed_closures)] - /// - /// # fn main() { - /// assert_eq!("11foo1bar11".trim_left_chars('1'), "foo1bar11"); - /// let x: &[_] = &['1', '2']; - /// assert_eq!("12foo1bar12".trim_left_chars(x), "foo1bar12"); - /// assert_eq!("123foo1bar123".trim_left_chars(|&: c: char| c.is_numeric()), "foo1bar123"); - /// # } - /// ``` fn trim_left_chars<'a, C: CharEq>(&'a self, to_trim: C) -> &'a str; - - /// Returns a string with trailing `chars_to_trim` removed. 
- /// - /// # Arguments - /// - /// * to_trim - a character matcher - /// - /// # Example - /// - /// ```rust - /// # #![feature(unboxed_closures)] - /// - /// # fn main() { - /// assert_eq!("11foo1bar11".trim_right_chars('1'), "11foo1bar"); - /// let x: &[_] = &['1', '2']; - /// assert_eq!("12foo1bar12".trim_right_chars(x), "12foo1bar"); - /// assert_eq!("123foo1bar123".trim_right_chars(|&: c: char| c.is_numeric()), "123foo1bar"); - /// # } - /// ``` fn trim_right_chars<'a, C: CharEq>(&'a self, to_trim: C) -> &'a str; - - /// Check that `index`-th byte lies at the start and/or end of a - /// UTF-8 code point sequence. - /// - /// The start and end of the string (when `index == self.len()`) - /// are considered to be boundaries. - /// - /// Panics if `index` is greater than `self.len()`. - /// - /// # Example - /// - /// ```rust - /// let s = "Löwe 老虎 Léopard"; - /// assert!(s.is_char_boundary(0)); - /// // start of `老` - /// assert!(s.is_char_boundary(6)); - /// assert!(s.is_char_boundary(s.len())); - /// - /// // second byte of `ö` - /// assert!(!s.is_char_boundary(2)); - /// - /// // third byte of `老` - /// assert!(!s.is_char_boundary(8)); - /// ``` fn is_char_boundary(&self, index: uint) -> bool; - - /// Pluck a character out of a string and return the index of the next - /// character. - /// - /// This function can be used to iterate over the Unicode characters of a - /// string. - /// - /// # Example - /// - /// This example manually iterates through the characters of a - /// string; this should normally be done by `.chars()` or - /// `.char_indices`. 
- /// - /// ```rust - /// use std::str::CharRange; - /// - /// let s = "中华Việt Nam"; - /// let mut i = 0u; - /// while i < s.len() { - /// let CharRange {ch, next} = s.char_range_at(i); - /// println!("{}: {}", i, ch); - /// i = next; - /// } - /// ``` - /// - /// This outputs: - /// - /// ```text - /// 0: 中 - /// 3: 华 - /// 6: V - /// 7: i - /// 8: ệ - /// 11: t - /// 12: - /// 13: N - /// 14: a - /// 15: m - /// ``` - /// - /// # Arguments - /// - /// * s - The string - /// * i - The byte offset of the char to extract - /// - /// # Return value - /// - /// A record {ch: char, next: uint} containing the char value and the byte - /// index of the next Unicode character. - /// - /// # Panics - /// - /// If `i` is greater than or equal to the length of the string. - /// If `i` is not the index of the beginning of a valid UTF-8 character. fn char_range_at(&self, start: uint) -> CharRange; - - /// Given a byte position and a str, return the previous char and its position. - /// - /// This function can be used to iterate over a Unicode string in reverse. - /// - /// Returns 0 for next index if called on start index 0. - /// - /// # Panics - /// - /// If `i` is greater than the length of the string. - /// If `i` is not an index following a valid UTF-8 character. fn char_range_at_reverse(&self, start: uint) -> CharRange; - - /// Plucks the character starting at the `i`th byte of a string. - /// - /// # Example - /// - /// ```rust - /// let s = "abπc"; - /// assert_eq!(s.char_at(1), 'b'); - /// assert_eq!(s.char_at(2), 'π'); - /// assert_eq!(s.char_at(4), 'c'); - /// ``` - /// - /// # Panics - /// - /// If `i` is greater than or equal to the length of the string. - /// If `i` is not the index of the beginning of a valid UTF-8 character. fn char_at(&self, i: uint) -> char; - - /// Plucks the character ending at the `i`th byte of a string. - /// - /// # Panics - /// - /// If `i` is greater than the length of the string. 
- /// If `i` is not an index following a valid UTF-8 character. fn char_at_reverse(&self, i: uint) -> char; - - /// Work with the byte buffer of a string as a byte slice. - /// - /// # Example - /// - /// ```rust - /// assert_eq!("bors".as_bytes(), b"bors"); - /// ``` fn as_bytes<'a>(&'a self) -> &'a [u8]; - - /// Returns the byte index of the first character of `self` that - /// matches `search`. - /// - /// # Return value - /// - /// `Some` containing the byte index of the last matching character - /// or `None` if there is no match - /// - /// # Example - /// - /// ```rust - /// # #![feature(unboxed_closures)] - /// - /// # fn main() { - /// let s = "Löwe 老虎 Léopard"; - /// - /// assert_eq!(s.find('L'), Some(0)); - /// assert_eq!(s.find('é'), Some(14)); - /// - /// // the first space - /// assert_eq!(s.find(|&: c: char| c.is_whitespace()), Some(5)); - /// - /// // neither are found - /// let x: &[_] = &['1', '2']; - /// assert_eq!(s.find(x), None); - /// # } - /// ``` fn find(&self, search: C) -> Option; - - /// Returns the byte index of the last character of `self` that - /// matches `search`. - /// - /// # Return value - /// - /// `Some` containing the byte index of the last matching character - /// or `None` if there is no match. 
- /// - /// # Example - /// - /// ```rust - /// # #![feature(unboxed_closures)] - /// - /// # fn main() { - /// let s = "Löwe 老虎 Léopard"; - /// - /// assert_eq!(s.rfind('L'), Some(13)); - /// assert_eq!(s.rfind('é'), Some(14)); - /// - /// // the second space - /// assert_eq!(s.rfind(|&: c: char| c.is_whitespace()), Some(12)); - /// - /// // searches for an occurrence of either `1` or `2`, but neither are found - /// let x: &[_] = &['1', '2']; - /// assert_eq!(s.rfind(x), None); - /// # } - /// ``` fn rfind(&self, search: C) -> Option; - - /// Returns the byte index of the first matching substring - /// - /// # Arguments - /// - /// * `needle` - The string to search for - /// - /// # Return value - /// - /// `Some` containing the byte index of the first matching substring - /// or `None` if there is no match. - /// - /// # Example - /// - /// ```rust - /// let s = "Löwe 老虎 Léopard"; - /// - /// assert_eq!(s.find_str("老虎 L"), Some(6)); - /// assert_eq!(s.find_str("muffin man"), None); - /// ``` fn find_str(&self, &str) -> Option; - - /// Retrieves the first character from a string slice and returns - /// it. This does not allocate a new string; instead, it returns a - /// slice that point one character beyond the character that was - /// shifted. If the string does not contain any characters, - /// None is returned instead. - /// - /// # Example - /// - /// ```rust - /// let s = "Löwe 老虎 Léopard"; - /// let (c, s1) = s.slice_shift_char().unwrap(); - /// assert_eq!(c, 'L'); - /// assert_eq!(s1, "öwe 老虎 Léopard"); - /// - /// let (c, s2) = s1.slice_shift_char().unwrap(); - /// assert_eq!(c, 'ö'); - /// assert_eq!(s2, "we 老虎 Léopard"); - /// ``` fn slice_shift_char<'a>(&'a self) -> Option<(char, &'a str)>; - - /// Returns the byte offset of an inner slice relative to an enclosing outer slice. - /// - /// Panics if `inner` is not a direct slice contained within self. 
- /// - /// # Example - /// - /// ```rust - /// let string = "a\nb\nc"; - /// let lines: Vec<&str> = string.lines().collect(); - /// - /// assert!(string.subslice_offset(lines[0]) == 0); // &"a" - /// assert!(string.subslice_offset(lines[1]) == 2); // &"b" - /// assert!(string.subslice_offset(lines[2]) == 4); // &"c" - /// ``` fn subslice_offset(&self, inner: &str) -> uint; - - /// Return an unsafe pointer to the strings buffer. - /// - /// The caller must ensure that the string outlives this pointer, - /// and that it is not reallocated (e.g. by pushing to the - /// string). fn as_ptr(&self) -> *const u8; - - /// Return an iterator of `u16` over the string encoded as UTF-16. - fn utf16_units<'a>(&'a self) -> Utf16CodeUnits<'a>; - - /// Return the number of bytes in this string - /// - /// # Example - /// - /// ``` - /// assert_eq!("foo".len(), 3); - /// assert_eq!("ƒoo".len(), 4); - /// ``` - #[experimental = "not triaged yet"] fn len(&self) -> uint; - - /// Returns true if this slice contains no bytes - /// - /// # Example - /// - /// ``` - /// assert!("".is_empty()); - /// ``` - #[inline] - #[experimental = "not triaged yet"] - fn is_empty(&self) -> bool { self.len() == 0 } + fn is_empty(&self) -> bool; } #[inline(never)] @@ -2001,7 +1227,7 @@ fn slice_error_fail(s: &str, begin: uint, end: uint) -> ! 
{ begin, end, s); } -impl StrPrelude for str { +impl StrExt for str { #[inline] fn contains(&self, needle: &str) -> bool { self.find_str(needle).is_some() @@ -2021,12 +1247,12 @@ impl StrPrelude for str { fn bytes(&self) -> Bytes { fn deref(&x: &u8) -> u8 { x } - self.as_bytes().iter().map(BytesFn(deref)) + Bytes { inner: self.as_bytes().iter().map(BytesFn(deref)) } } #[inline] - fn char_indices(&self) -> CharOffsets { - CharOffsets{front_offset: 0, iter: self.chars()} + fn char_indices(&self) -> CharIndices { + CharIndices { front_offset: 0, iter: self.chars() } } #[inline] @@ -2089,18 +1315,18 @@ impl StrPrelude for str { } #[inline] - fn lines(&self) -> CharSplits { - self.split_terminator('\n') + fn lines(&self) -> Lines { + Lines { inner: self.split_terminator('\n') } } - fn lines_any(&self) -> AnyLines { + fn lines_any(&self) -> LinesAny { fn f(line: &str) -> &str { let l = line.len(); if l > 0 && line.as_bytes()[l - 1] == b'\r' { line.slice(0, l - 1) } else { line } } - self.lines().map(f) + LinesAny { inner: self.lines().map(f) } } #[inline] @@ -2353,12 +1579,10 @@ impl StrPrelude for str { } #[inline] - fn utf16_units(&self) -> Utf16CodeUnits { - Utf16CodeUnits { encoder: Utf16Encoder::new(self.chars()) } - } + fn len(&self) -> uint { self.repr().len } #[inline] - fn len(&self) -> uint { self.repr().len } + fn is_empty(&self) -> bool { self.len() == 0 } } #[stable] @@ -2367,3 +1591,29 @@ impl<'a> Default for &'a str { fn default() -> &'a str { "" } } + +impl<'a> Iterator<&'a str> for Lines<'a> { + #[inline] + fn next(&mut self) -> Option<&'a str> { self.inner.next() } +} +impl<'a> DoubleEndedIterator<&'a str> for Lines<'a> { + #[inline] + fn next_back(&mut self) -> Option<&'a str> { self.inner.next_back() } +} +impl<'a> Iterator<&'a str> for LinesAny<'a> { + #[inline] + fn next(&mut self) -> Option<&'a str> { self.inner.next() } +} +impl<'a> DoubleEndedIterator<&'a str> for LinesAny<'a> { + #[inline] + fn next_back(&mut self) -> Option<&'a str> { 
self.inner.next_back() } +} +impl<'a> Iterator for Bytes<'a> { + #[inline] + fn next(&mut self) -> Option { self.inner.next() } +} +impl<'a> DoubleEndedIterator for Bytes<'a> { + #[inline] + fn next_back(&mut self) -> Option { self.inner.next_back() } +} +impl<'a> ExactSizeIterator for Bytes<'a> {} diff --git a/src/librustc/lib.rs b/src/librustc/lib.rs index 463dcddaf94..4a1dd121516 100644 --- a/src/librustc/lib.rs +++ b/src/librustc/lib.rs @@ -115,6 +115,7 @@ pub mod util { pub mod ppaux; pub mod nodemap; pub mod snapshot_vec; + pub mod lev_distance; } pub mod lib { diff --git a/src/librustc/util/lev_distance.rs b/src/librustc/util/lev_distance.rs new file mode 100644 index 00000000000..24e98837444 --- /dev/null +++ b/src/librustc/util/lev_distance.rs @@ -0,0 +1,63 @@ +// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. 
+ +use std::cmp; + +pub fn lev_distance(me: &str, t: &str) -> uint { + if me.is_empty() { return t.chars().count(); } + if t.is_empty() { return me.chars().count(); } + + let mut dcol = Vec::from_fn(t.len() + 1, |x| x); + let mut t_last = 0; + + for (i, sc) in me.chars().enumerate() { + + let mut current = i; + dcol[0] = current + 1; + + for (j, tc) in t.chars().enumerate() { + + let next = dcol[j + 1]; + + if sc == tc { + dcol[j + 1] = current; + } else { + dcol[j + 1] = cmp::min(current, next); + dcol[j + 1] = cmp::min(dcol[j + 1], dcol[j]) + 1; + } + + current = next; + t_last = j; + } + } + + dcol[t_last + 1] +} + +#[test] +fn test_lev_distance() { + use std::char::{ from_u32, MAX }; + // Test bytelength agnosticity + for c in range(0u32, MAX as u32) + .filter_map(|i| from_u32(i)) + .map(|i| String::from_char(1, i)) { + assert_eq!(lev_distance(c[], c[]), 0); + } + + let a = "\nMäry häd ä little lämb\n\nLittle lämb\n"; + let b = "\nMary häd ä little lämb\n\nLittle lämb\n"; + let c = "Mary häd ä little lämb\n\nLittle lämb\n"; + assert_eq!(lev_distance(a, b), 1); + assert_eq!(lev_distance(b, a), 1); + assert_eq!(lev_distance(a, c), 2); + assert_eq!(lev_distance(c, a), 2); + assert_eq!(lev_distance(b, c), 1); + assert_eq!(lev_distance(c, b), 1); +} diff --git a/src/librustc_resolve/lib.rs b/src/librustc_resolve/lib.rs index ac8d5d1e977..d4a0b49436d 100644 --- a/src/librustc_resolve/lib.rs +++ b/src/librustc_resolve/lib.rs @@ -57,6 +57,7 @@ use rustc::middle::privacy::*; use rustc::middle::subst::{ParamSpace, FnSpace, TypeSpace}; use rustc::middle::ty::{CaptureModeMap, Freevar, FreevarMap, TraitMap}; use rustc::util::nodemap::{NodeMap, NodeSet, DefIdSet, FnvHashMap}; +use rustc::util::lev_distance::lev_distance; use syntax::ast::{Arm, BindByRef, BindByValue, BindingMode, Block, Crate, CrateNum}; use syntax::ast::{DeclItem, DefId, Expr, ExprAgain, ExprBreak, ExprField}; @@ -96,8 +97,8 @@ use std::mem::replace; use std::rc::{Rc, Weak}; use std::uint; -mod 
check_unused; -mod record_exports; +// Definition mapping +pub type DefMap = RefCell>; #[deriving(Copy)] struct BindingInfo { @@ -5539,7 +5540,7 @@ impl<'a> Resolver<'a> { let mut smallest = 0; for (i, other) in maybes.iter().enumerate() { - values[i] = name.lev_distance(other.get()); + values[i] = lev_distance(name, other.get()); if values[i] <= values[smallest] { smallest = i; diff --git a/src/libstd/error.rs b/src/libstd/error.rs index 9ad2655f6e9..cd7d9aacc90 100644 --- a/src/libstd/error.rs +++ b/src/libstd/error.rs @@ -78,10 +78,9 @@ //! } //! ``` -use option::Option; -use option::Option::None; -use kinds::Send; -use string::String; +use prelude::*; + +use str::Utf8Error; /// Base functionality for all errors in Rust. pub trait Error: Send { @@ -107,3 +106,14 @@ impl FromError for E { err } } + +impl Error for Utf8Error { + fn description(&self) -> &str { + match *self { + Utf8Error::TooShort => "invalid utf-8: not enough bytes", + Utf8Error::InvalidByte(..) => "invalid utf-8: corrupt contents", + } + } + + fn detail(&self) -> Option { Some(self.to_string()) } +} diff --git a/src/libstd/os.rs b/src/libstd/os.rs index 258e8964a9f..a16ee982f5c 100644 --- a/src/libstd/os.rs +++ b/src/libstd/os.rs @@ -729,7 +729,7 @@ fn real_args() -> Vec { // Push it onto the list. let ptr = ptr as *const u16; let buf = slice::from_raw_buf(&ptr, len); - let opt_s = String::from_utf16(::str::truncate_utf16_at_nul(buf)); + let opt_s = String::from_utf16(os_imp::truncate_utf16_at_nul(buf)); opt_s.expect("CommandLineToArgvW returned invalid UTF-16") }); diff --git a/src/libstd/sys/windows/os.rs b/src/libstd/sys/windows/os.rs index e2220b7b67b..e1016048e58 100644 --- a/src/libstd/sys/windows/os.rs +++ b/src/libstd/sys/windows/os.rs @@ -31,6 +31,16 @@ use libc::types::os::arch::extra::DWORD; const BUF_BYTES : uint = 2048u; +/// Return a slice of `v` ending at (and not including) the first NUL +/// (0). 
+pub fn truncate_utf16_at_nul<'a>(v: &'a [u16]) -> &'a [u16] { + match v.iter().position(|c| *c == 0) { + // don't include the 0 + Some(i) => v[..i], + None => v + } +} + pub fn errno() -> uint { use libc::types::os::arch::extra::DWORD; @@ -87,7 +97,7 @@ pub fn error_string(errnum: i32) -> String { return format!("OS Error {} (FormatMessageW() returned error {})", errnum, fm_err); } - let msg = String::from_utf16(::str::truncate_utf16_at_nul(&buf)); + let msg = String::from_utf16(truncate_utf16_at_nul(&buf)); match msg { Some(msg) => format!("OS Error {}: {}", errnum, msg), None => format!("OS Error {} (FormatMessageW() returned invalid UTF-16)", errnum), @@ -294,3 +304,30 @@ pub fn page_size() -> uint { return info.dwPageSize as uint; } } + +#[cfg(test)] +mod tests { + use super::truncate_utf16_at_nul; + + #[test] + fn test_truncate_utf16_at_nul() { + let v = []; + let b: &[u16] = &[]; + assert_eq!(truncate_utf16_at_nul(&v), b); + + let v = [0, 2, 3]; + assert_eq!(truncate_utf16_at_nul(&v), b); + + let v = [1, 0, 3]; + let b: &[u16] = &[1]; + assert_eq!(truncate_utf16_at_nul(&v), b); + + let v = [1, 2, 0]; + let b: &[u16] = &[1, 2]; + assert_eq!(truncate_utf16_at_nul(&v), b); + + let v = [1, 2, 3]; + let b: &[u16] = &[1, 2, 3]; + assert_eq!(truncate_utf16_at_nul(&v), b); + } +} diff --git a/src/libunicode/lib.rs b/src/libunicode/lib.rs index 1f75daa7bde..d33362ec232 100644 --- a/src/libunicode/lib.rs +++ b/src/libunicode/lib.rs @@ -28,8 +28,7 @@ html_root_url = "http://doc.rust-lang.org/nightly/", html_playground_url = "http://play.rust-lang.org/")] #![no_std] -#![feature(globs)] -#![feature(unboxed_closures)] +#![feature(globs, macro_rules, slicing_syntax, unboxed_closures)] extern crate core; @@ -74,11 +73,14 @@ pub mod char { } pub mod str { - pub use u_str::{UnicodeStrPrelude, Words, Graphemes, GraphemeIndices}; + pub use u_str::{UnicodeStr, Words, Graphemes, GraphemeIndices}; + pub use u_str::{utf8_char_width, is_utf16, Utf16Items, Utf16Item}; + pub use 
u_str::{utf16_items, Utf16Encoder}; } -// this lets us use #[deriving(Clone)] +// this lets us use #[deriving(..)] mod std { pub use core::clone; pub use core::cmp; + pub use core::fmt; } diff --git a/src/libunicode/u_str.rs b/src/libunicode/u_str.rs index 5e98109c432..5d7d2951628 100644 --- a/src/libunicode/u_str.rs +++ b/src/libunicode/u_str.rs @@ -15,24 +15,36 @@ //! This module provides functionality to `str` that requires the Unicode methods provided by the //! UnicodeChar trait. -use self::GraphemeState::*; +use core::prelude::*; + +use core::char; use core::cmp; -use core::slice::SliceExt; -use core::iter::{Filter, AdditiveIterator, Iterator, IteratorExt}; use core::iter::{DoubleEndedIterator, DoubleEndedIteratorExt}; +use core::iter::{Filter, AdditiveIterator, Iterator, IteratorExt}; +use core::iter::{Filter, AdditiveIterator}; use core::kinds::Sized; -use core::option::Option; +use core::mem; +use core::num::Int; use core::option::Option::{None, Some}; +use core::option::Option; +use core::slice::SliceExt; +use core::slice; use core::str::{CharSplits, StrPrelude}; +use core::str::{CharSplits}; + use u_char::UnicodeChar; use tables::grapheme::GraphemeCat; /// An iterator over the words of a string, separated by a sequence of whitespace /// FIXME: This should be opaque -pub type Words<'a> = Filter<&'a str, CharSplits<'a, fn(char) -> bool>, fn(&&str) -> bool>; +#[stable] +pub struct Words<'a> { + inner: Filter<'a, &'a str, CharSplits<'a, |char|:'a -> bool>, + fn(&&str) -> bool>, +} /// Methods for Unicode string slices -pub trait UnicodeStrPrelude for Sized? { +pub trait UnicodeStr for Sized? { /// Returns an iterator over the /// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries) /// of the string. @@ -77,6 +89,7 @@ pub trait UnicodeStrPrelude for Sized? 
{ /// let v: Vec<&str> = some_words.words().collect(); /// assert_eq!(v, vec!["Mary", "had", "a", "little", "lamb"]); /// ``` + #[stable] fn words<'a>(&'a self) -> Words<'a>; /// Returns true if the string contains only whitespace. @@ -129,7 +142,7 @@ pub trait UnicodeStrPrelude for Sized? { fn trim_right<'a>(&'a self) -> &'a str; } -impl UnicodeStrPrelude for str { +impl UnicodeStr for str { #[inline] fn graphemes(&self, is_extended: bool) -> Graphemes { Graphemes { string: self, extended: is_extended, cat: None, catb: None } @@ -145,7 +158,7 @@ impl UnicodeStrPrelude for str { fn is_not_empty(s: &&str) -> bool { !s.is_empty() } fn is_whitespace(c: char) -> bool { c.is_whitespace() } - self.split(is_whitespace).filter(is_not_empty) + Words { inner: self.split(is_whitespace).filter(is_not_empty) } } #[inline] @@ -428,3 +441,196 @@ impl<'a> DoubleEndedIterator<&'a str> for Graphemes<'a> { Some(retstr) } } + +// https://tools.ietf.org/html/rfc3629 +static UTF8_CHAR_WIDTH: [u8, ..256] = [ +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF +0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2, +2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF +3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF +4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF +]; + +/// Given a first byte, determine how many bytes are in this UTF-8 character +#[inline] +pub fn utf8_char_width(b: u8) -> uint { + return UTF8_CHAR_WIDTH[b as uint] as uint; +} + +/// Determines if a vector of `u16` contains valid UTF-16 +pub fn is_utf16(v: &[u16]) -> bool { + let mut it = v.iter(); + macro_rules! 
next ( ($ret:expr) => { + match it.next() { Some(u) => *u, None => return $ret } + } + ) + loop { + let u = next!(true); + + match char::from_u32(u as u32) { + Some(_) => {} + None => { + let u2 = next!(false); + if u < 0xD7FF || u > 0xDBFF || + u2 < 0xDC00 || u2 > 0xDFFF { return false; } + } + } + } +} + +/// An iterator that decodes UTF-16 encoded codepoints from a vector +/// of `u16`s. +#[deriving(Clone)] +pub struct Utf16Items<'a> { + iter: slice::Items<'a, u16> +} +/// The possibilities for values decoded from a `u16` stream. +#[deriving(PartialEq, Eq, Clone, Show)] +pub enum Utf16Item { + /// A valid codepoint. + ScalarValue(char), + /// An invalid surrogate without its pair. + LoneSurrogate(u16) +} + +impl Copy for Utf16Item {} + +impl Utf16Item { + /// Convert `self` to a `char`, taking `LoneSurrogate`s to the + /// replacement character (U+FFFD). + #[inline] + pub fn to_char_lossy(&self) -> char { + match *self { + Utf16Item::ScalarValue(c) => c, + Utf16Item::LoneSurrogate(_) => '\uFFFD' + } + } +} + +impl<'a> Iterator for Utf16Items<'a> { + fn next(&mut self) -> Option { + let u = match self.iter.next() { + Some(u) => *u, + None => return None + }; + + if u < 0xD800 || 0xDFFF < u { + // not a surrogate + Some(Utf16Item::ScalarValue(unsafe {mem::transmute(u as u32)})) + } else if u >= 0xDC00 { + // a trailing surrogate + Some(Utf16Item::LoneSurrogate(u)) + } else { + // preserve state for rewinding. + let old = self.iter; + + let u2 = match self.iter.next() { + Some(u2) => *u2, + // eof + None => return Some(Utf16Item::LoneSurrogate(u)) + }; + if u2 < 0xDC00 || u2 > 0xDFFF { + // not a trailing surrogate so we're not a valid + // surrogate pair, so rewind to redecode u2 next time. + self.iter = old; + return Some(Utf16Item::LoneSurrogate(u)) + } + + // all ok, so lets decode it. 
+ let c = ((u - 0xD800) as u32 << 10 | (u2 - 0xDC00) as u32) + 0x1_0000; + Some(Utf16Item::ScalarValue(unsafe {mem::transmute(c)})) + } + } + + #[inline] + fn size_hint(&self) -> (uint, Option) { + let (low, high) = self.iter.size_hint(); + // we could be entirely valid surrogates (2 elements per + // char), or entirely non-surrogates (1 element per char) + (low / 2, high) + } +} + +/// Create an iterator over the UTF-16 encoded codepoints in `v`, +/// returning invalid surrogates as `LoneSurrogate`s. +/// +/// # Example +/// +/// ```rust +/// use std::str; +/// use std::str::{ScalarValue, LoneSurrogate}; +/// +/// // 𝄞music +/// let v = [0xD834, 0xDD1E, 0x006d, 0x0075, +/// 0x0073, 0xDD1E, 0x0069, 0x0063, +/// 0xD834]; +/// +/// assert_eq!(str::utf16_items(&v).collect::>(), +/// vec![ScalarValue('𝄞'), +/// ScalarValue('m'), ScalarValue('u'), ScalarValue('s'), +/// LoneSurrogate(0xDD1E), +/// ScalarValue('i'), ScalarValue('c'), +/// LoneSurrogate(0xD834)]); +/// ``` +pub fn utf16_items<'a>(v: &'a [u16]) -> Utf16Items<'a> { + Utf16Items { iter : v.iter() } +} + +/// Iterator adaptor for encoding `char`s to UTF-16. +#[deriving(Clone)] +pub struct Utf16Encoder { + chars: I, + extra: u16 +} + +impl Utf16Encoder { + /// Create an UTF-16 encoder from any `char` iterator. + pub fn new(chars: I) -> Utf16Encoder where I: Iterator { + Utf16Encoder { chars: chars, extra: 0 } + } +} + +impl Iterator for Utf16Encoder where I: Iterator { + #[inline] + fn next(&mut self) -> Option { + if self.extra != 0 { + let tmp = self.extra; + self.extra = 0; + return Some(tmp); + } + + let mut buf = [0u16, ..2]; + self.chars.next().map(|ch| { + let n = ch.encode_utf16(buf[mut]).unwrap_or(0); + if n == 2 { self.extra = buf[1]; } + buf[0] + }) + } + + #[inline] + fn size_hint(&self) -> (uint, Option) { + let (low, high) = self.chars.size_hint(); + // every char gets either one u16 or two u16, + // so this iterator is between 1 or 2 times as + // long as the underlying iterator. 
+ (low, high.and_then(|n| n.checked_mul(2))) + } +} + +impl<'a> Iterator<&'a str> for Words<'a> { + fn next(&mut self) -> Option<&'a str> { self.inner.next() } +} +impl<'a> DoubleEndedIterator<&'a str> for Words<'a> { + fn next_back(&mut self) -> Option<&'a str> { self.inner.next_back() } +}