From b59d50368e5223d9c66c7a64c5aeff2b8e9b343f Mon Sep 17 00:00:00 2001 From: blake2-ppc Date: Sun, 25 Aug 2013 08:54:47 +0200 Subject: [PATCH] std::str: Double-ended CharSplitIterator Add new methods `.rsplit_iter()` and `.rsplitn_iter()` for &str. Separate out CharSplitIterator and CharSplitNIterator, CharSplitIterator (`split_iter` and `rsplit_iter`) is made double-ended while `splitn_iter` and `rsplitn_iter` (limited to N splits) are not, since these don't have the same symmetry. With CharSplitIterator being double ended, derived iterators like `line_iter` and `word_iter` are too. --- src/libstd/str.rs | 298 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 229 insertions(+), 69 deletions(-) diff --git a/src/libstd/str.rs b/src/libstd/str.rs index bee354a0a5d..d4f1c37f7a2 100644 --- a/src/libstd/str.rs +++ b/src/libstd/str.rs @@ -21,9 +21,10 @@ use char; use char::Char; use clone::{Clone, DeepClone}; use container::{Container, Mutable}; +use either::{Left, Right}; use iter::Times; use iterator::{Iterator, FromIterator, Extendable}; -use iterator::{Filter, AdditiveIterator, Map, Enumerate}; +use iterator::{Filter, AdditiveIterator, Map}; use iterator::{Invert, DoubleEndedIterator}; use libc; use num::{Saturating, Zero}; @@ -359,28 +360,32 @@ pub type ByteIterator<'self> = /// Use with the `std::iterator` module. pub type ByteRevIterator<'self> = Invert>; -/// An iterator over byte index and either &u8 or char -#[deriving(Clone)] -enum OffsetIterator<'self> { - // use ByteIterator here when it can be cloned - ByteOffset(Enumerate>), - CharOffset(CharOffsetIterator<'self>), -} - /// An iterator over the substrings of a string, separated by `sep`. #[deriving(Clone)] -pub struct CharSplitIterator<'self,Sep> { - priv iter: OffsetIterator<'self>, +pub struct CharSplitIterator<'self, Sep> { + /// The slice remaining to be iterated priv string: &'self str, - priv position: uint, priv sep: Sep, - /// The number of splits remaining - priv count: uint, /// Whether an empty string at the end is allowed priv allow_trailing_empty: bool, + priv only_ascii: bool, priv finished: bool, } +/// An iterator over the substrings of a string, separated by `sep`, +/// starting from the back of the string. +pub type CharRSplitIterator<'self, Sep> = Invert>; + +/// An iterator over the substrings of a string, separated by `sep`, +/// splitting at most `count` times. +#[deriving(Clone)] +pub struct CharSplitNIterator<'self, Sep> { + priv iter: CharSplitIterator<'self, Sep>, + /// The number of splits remaining + priv count: uint, + priv invert: bool, +} + /// An iterator over the words of a string, separated by an sequence of whitespace pub type WordIterator<'self> = Filter<'self, &'self str, CharSplitIterator<'self, extern "Rust" fn(char) -> bool>>; @@ -389,46 +394,111 @@ pub type WordIterator<'self> = pub type AnyLineIterator<'self> = Map<'self, &'self str, &'self str, CharSplitIterator<'self, char>>; +impl<'self, Sep> CharSplitIterator<'self, Sep> { + #[inline] + fn get_end(&mut self) -> Option<&'self str> { + if !self.finished && (self.allow_trailing_empty || self.string.len() > 0) { + self.finished = true; + Some(self.string) + } else { + None + } + } +} + impl<'self, Sep: CharEq> Iterator<&'self str> for CharSplitIterator<'self, Sep> { #[inline] fn next(&mut self) -> Option<&'self str> { if self.finished { return None } - let start = self.position; let len = self.string.len(); + let mut iter = match self.only_ascii { + true => Left(self.string.byte_iter().enumerate()), + false => Right(self.string.char_offset_iter()) + }; - if self.count > 0 { - match self.iter { + loop { + let (idx, next) = match iter { // this gives a *huge* speed up for splitting on ASCII // characters (e.g. '\n' or ' ') - ByteOffset(ref mut iter) => - for (idx, &byte) in *iter { - if self.sep.matches(byte as char) { - self.position = idx + 1; - self.count -= 1; - return Some(unsafe { - raw::slice_bytes(self.string, start, idx) - }) - } - }, - CharOffset(ref mut iter) => - for (idx, ch) in *iter { - if self.sep.matches(ch) { - // skip over the separator - self.position = self.string.char_range_at(idx).next; - self.count -= 1; - return Some(unsafe { - raw::slice_bytes(self.string, start, idx) - }) - } + Left(ref mut it) => match it.next() { + Some((idx, byte)) if byte < 128u8 && self.sep.matches(byte as char) => + (idx, idx + 1), + Some(*) => loop, + None => break, + }, + Right(ref mut it) => match it.next() { + Some((idx, ch)) if self.sep.matches(ch) => + (idx, self.string.char_range_at(idx).next), + Some(*) => loop, + None => break, + } + }; + unsafe { + let elt = raw::slice_bytes(self.string, 0, idx); + self.string = raw::slice_bytes(self.string, next, len); + return Some(elt) + } + } + self.get_end() + } +} + +impl<'self, Sep: CharEq> DoubleEndedIterator<&'self str> +for CharSplitIterator<'self, Sep> { + #[inline] + fn next_back(&mut self) -> Option<&'self str> { + if self.finished { return None } + + if !self.allow_trailing_empty { + self.allow_trailing_empty = true; + match self.next_back() { + Some(elt) if !elt.is_empty() => return Some(elt), + _ => if self.finished { return None } + } + } + let len = self.string.len(); + let mut iter = match self.only_ascii { + true => Left(self.string.byte_rev_iter().enumerate()), + false => Right(self.string.char_offset_iter()) + }; + + loop { + let (idx, next) = match iter { + Left(ref mut it) => match it.next() { + Some((j, byte)) if byte < 128u8 && self.sep.matches(byte as char) => { + let idx = self.string.len() - j - 1; + (idx, idx + 1) }, + Some(*) => loop, + None => break, + }, + Right(ref mut it) => match it.next_back() { + Some((idx, ch)) if self.sep.matches(ch) => + (idx, self.string.char_range_at(idx).next), + Some(*) => loop, + None => break, + } + }; + unsafe { + let elt = raw::slice_bytes(self.string, next, len); + self.string = raw::slice_bytes(self.string, 0, idx); + return Some(elt) } } self.finished = true; - if self.allow_trailing_empty || start < len { - Some(unsafe { raw::slice_bytes(self.string, start, len) }) + Some(self.string) + } +} + +impl<'self, Sep: CharEq> Iterator<&'self str> for CharSplitNIterator<'self, Sep> { + #[inline] + fn next(&mut self) -> Option<&'self str> { + if self.count != 0 { + self.count -= 1; + if self.invert { self.iter.next_back() } else { self.iter.next() } } else { - None + self.iter.get_end() } } } @@ -1271,9 +1341,10 @@ pub trait StrSlice<'self> { fn char_offset_iter(&self) -> CharOffsetIterator<'self>; fn char_offset_rev_iter(&self) -> CharOffsetRevIterator<'self>; fn split_iter(&self, sep: Sep) -> CharSplitIterator<'self, Sep>; - fn splitn_iter(&self, sep: Sep, count: uint) -> CharSplitIterator<'self, Sep>; - fn split_options_iter(&self, sep: Sep, count: uint, allow_trailing_empty: bool) - -> CharSplitIterator<'self, Sep>; + fn splitn_iter(&self, sep: Sep, count: uint) -> CharSplitNIterator<'self, Sep>; + fn split_terminator_iter(&self, sep: Sep) -> CharSplitIterator<'self, Sep>; + fn rsplit_iter(&self, sep: Sep) -> CharRSplitIterator<'self, Sep>; + fn rsplitn_iter(&self, sep: Sep, count: uint) -> CharSplitNIterator<'self, Sep>; fn matches_index_iter(&self, sep: &'self str) -> MatchesIndexIterator<'self>; fn split_str_iter(&self, &'self str) -> StrSplitIterator<'self>; fn line_iter(&self) -> CharSplitIterator<'self, char>; @@ -1410,40 +1481,78 @@ impl<'self> StrSlice<'self> for &'self str { /// ~~~ #[inline] fn split_iter(&self, sep: Sep) -> CharSplitIterator<'self, Sep> { - self.split_options_iter(sep, self.len(), true) + CharSplitIterator { + string: *self, + only_ascii: sep.only_ascii(), + sep: sep, + allow_trailing_empty: true, + finished: false, + } } /// An iterator over substrings of `self`, separated by characters /// matched by `sep`, restricted to splitting at most `count` /// times. #[inline] - fn splitn_iter(&self, sep: Sep, count: uint) -> CharSplitIterator<'self, Sep> { - self.split_options_iter(sep, count, true) + fn splitn_iter(&self, sep: Sep, count: uint) + -> CharSplitNIterator<'self, Sep> { + CharSplitNIterator { + iter: self.split_iter(sep), + count: count, + invert: false, + } } /// An iterator over substrings of `self`, separated by characters - /// matched by `sep`, splitting at most `count` times, and - /// possibly not including the trailing empty substring, if it - /// exists. + /// matched by `sep`. + /// + /// Equivalent to `split_iter`, except that the trailing substring + /// is skipped if empty (terminator semantics). + /// + /// # Example + /// + /// ~~~ {.rust} + /// let v: ~[&str] = "A.B.".split_terminator_iter('.').collect(); + /// assert_eq!(v, ~["A", "B"]); + /// ~~~ #[inline] - fn split_options_iter(&self, sep: Sep, count: uint, allow_trailing_empty: bool) + fn split_terminator_iter(&self, sep: Sep) -> CharSplitIterator<'self, Sep> { - let iter = if sep.only_ascii() { - ByteOffset(self.as_bytes().iter().enumerate()) - } else { - CharOffset(self.char_offset_iter()) - }; CharSplitIterator { - iter: iter, - string: *self, - position: 0, - sep: sep, - count: count, - allow_trailing_empty: allow_trailing_empty, - finished: false, + allow_trailing_empty: false, + ..self.split_iter(sep) } } + /// An iterator over substrings of `self`, separated by characters + /// matched by `sep`, in reverse order + /// + /// # Example + /// + /// ~~~ {.rust} + /// let v: ~[&str] = "Mary had a little lamb".rsplit_iter(' ').collect(); + /// assert_eq!(v, ~["lamb", "little", "a", "had", "Mary"]); + /// ~~~ + #[inline] + fn rsplit_iter(&self, sep: Sep) -> CharRSplitIterator<'self, Sep> { + self.split_iter(sep).invert() + } + + /// An iterator over substrings of `self`, separated by characters + /// matched by `sep`, starting from the end of the string. + /// Restricted to splitting at most `count` times. + #[inline] + fn rsplitn_iter(&self, sep: Sep, count: uint) + -> CharSplitNIterator<'self, Sep> { + CharSplitNIterator { + iter: self.split_iter(sep), + count: count, + invert: true, + } + } + + + /// An iterator over the start and end indices of each match of /// `sep` within `self`. #[inline] @@ -1477,7 +1586,7 @@ impl<'self> StrSlice<'self> for &'self str { /// by `\n`). #[inline] fn line_iter(&self) -> CharSplitIterator<'self, char> { - self.split_options_iter('\n', self.len(), false) + self.split_terminator_iter('\n') } /// An iterator over the lines of a string, separated by either @@ -3371,17 +3480,33 @@ mod tests { let data = "\nMäry häd ä little lämb\nLittle lämb\n"; let split: ~[&str] = data.split_iter(' ').collect(); - assert_eq!(split, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]); + assert_eq!( split, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]); + + let mut rsplit: ~[&str] = data.rsplit_iter(' ').collect(); + rsplit.reverse(); + assert_eq!(rsplit, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]); let split: ~[&str] = data.split_iter(|c: char| c == ' ').collect(); - assert_eq!(split, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]); + assert_eq!( split, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]); + + let mut rsplit: ~[&str] = data.rsplit_iter(|c: char| c == ' ').collect(); + rsplit.reverse(); + assert_eq!(rsplit, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]); // Unicode let split: ~[&str] = data.split_iter('ä').collect(); - assert_eq!(split, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]); + assert_eq!( split, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]); + + let mut rsplit: ~[&str] = data.rsplit_iter('ä').collect(); + rsplit.reverse(); + assert_eq!(rsplit, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]); let split: ~[&str] = data.split_iter(|c: char| c == 'ä').collect(); - assert_eq!(split, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]); + assert_eq!( split, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]); + + let mut rsplit: ~[&str] = data.rsplit_iter(|c: char| c == 'ä').collect(); + rsplit.reverse(); + assert_eq!(rsplit, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]); } #[test] @@ -3402,14 +3527,49 @@ mod tests { assert_eq!(split, ~["\nM", "ry h", "d ", " little lämb\nLittle lämb\n"]); } + #[test] + fn test_rsplitn_char_iterator() { + let data = "\nMäry häd ä little lämb\nLittle lämb\n"; + + let mut split: ~[&str] = data.rsplitn_iter(' ', 3).collect(); + split.reverse(); + assert_eq!(split, ~["\nMäry häd ä", "little", "lämb\nLittle", "lämb\n"]); + + let mut split: ~[&str] = data.rsplitn_iter(|c: char| c == ' ', 3).collect(); + split.reverse(); + assert_eq!(split, ~["\nMäry häd ä", "little", "lämb\nLittle", "lämb\n"]); + + // Unicode + let mut split: ~[&str] = data.rsplitn_iter('ä', 3).collect(); + split.reverse(); + assert_eq!(split, ~["\nMäry häd ", " little l", "mb\nLittle l", "mb\n"]); + + let mut split: ~[&str] = data.rsplitn_iter(|c: char| c == 'ä', 3).collect(); + split.reverse(); + assert_eq!(split, ~["\nMäry häd ", " little l", "mb\nLittle l", "mb\n"]); + } + #[test] fn test_split_char_iterator_no_trailing() { let data = "\nMäry häd ä little lämb\nLittle lämb\n"; - let split: ~[&str] = data.split_options_iter('\n', 1000, true).collect(); + let split: ~[&str] = data.split_iter('\n').collect(); assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb", ""]); - let split: ~[&str] = data.split_options_iter('\n', 1000, false).collect(); + let split: ~[&str] = data.split_terminator_iter('\n').collect(); + assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb"]); + } + + #[test] + fn test_rev_split_char_iterator_no_trailing() { + let data = "\nMäry häd ä little lämb\nLittle lämb\n"; + + let mut split: ~[&str] = data.split_iter('\n').invert().collect(); + split.reverse(); + assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb", ""]); + + let mut split: ~[&str] = data.split_terminator_iter('\n').invert().collect(); + split.reverse(); assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb"]); }