std::str: Double-ended CharSplitIterator

Add new methods `.rsplit_iter()` and `.rsplitn_iter()` for &str.

Separate out CharSplitIterator and CharSplitNIterator,
CharSplitIterator (`split_iter` and `rsplit_iter`) is made double-ended
while `splitn_iter` and `rsplitn_iter` (limited to N splits) are not,
since these don't have the same symmetry.

With CharSplitIterator being double ended, derived iterators like
`line_iter` and `word_iter` are too.
This commit is contained in:
blake2-ppc 2013-08-25 08:54:47 +02:00
parent a17c7e4f2c
commit b59d50368e

View File

@ -21,9 +21,10 @@ use char;
use char::Char;
use clone::{Clone, DeepClone};
use container::{Container, Mutable};
use either::{Left, Right};
use iter::Times;
use iterator::{Iterator, FromIterator, Extendable};
use iterator::{Filter, AdditiveIterator, Map, Enumerate};
use iterator::{Filter, AdditiveIterator, Map};
use iterator::{Invert, DoubleEndedIterator};
use libc;
use num::{Saturating, Zero};
@ -359,28 +360,32 @@ pub type ByteIterator<'self> =
/// Use with the `std::iterator` module.
pub type ByteRevIterator<'self> = Invert<ByteIterator<'self>>;
/// An iterator over byte index and either &u8 or char
#[deriving(Clone)]
enum OffsetIterator<'self> {
// use ByteIterator here when it can be cloned
ByteOffset(Enumerate<vec::VecIterator<'self, u8>>),
CharOffset(CharOffsetIterator<'self>),
}
/// An iterator over the substrings of a string, separated by `sep`.
#[deriving(Clone)]
pub struct CharSplitIterator<'self,Sep> {
priv iter: OffsetIterator<'self>,
pub struct CharSplitIterator<'self, Sep> {
/// The slice remaining to be iterated
priv string: &'self str,
priv position: uint,
priv sep: Sep,
/// The number of splits remaining
priv count: uint,
/// Whether an empty string at the end is allowed
priv allow_trailing_empty: bool,
priv only_ascii: bool,
priv finished: bool,
}
/// An iterator over the substrings of a string, separated by `sep`,
/// starting from the back of the string.
pub type CharRSplitIterator<'self, Sep> = Invert<CharSplitIterator<'self, Sep>>;
/// An iterator over the substrings of a string, separated by `sep`,
/// splitting at most `count` times.
#[deriving(Clone)]
pub struct CharSplitNIterator<'self, Sep> {
priv iter: CharSplitIterator<'self, Sep>,
/// The number of splits remaining
priv count: uint,
priv invert: bool,
}
/// An iterator over the words of a string, separated by an sequence of whitespace
pub type WordIterator<'self> =
Filter<'self, &'self str, CharSplitIterator<'self, extern "Rust" fn(char) -> bool>>;
@ -389,46 +394,111 @@ pub type WordIterator<'self> =
pub type AnyLineIterator<'self> =
Map<'self, &'self str, &'self str, CharSplitIterator<'self, char>>;
impl<'self, Sep> CharSplitIterator<'self, Sep> {
#[inline]
fn get_end(&mut self) -> Option<&'self str> {
if !self.finished && (self.allow_trailing_empty || self.string.len() > 0) {
self.finished = true;
Some(self.string)
} else {
None
}
}
}
impl<'self, Sep: CharEq> Iterator<&'self str> for CharSplitIterator<'self, Sep> {
#[inline]
fn next(&mut self) -> Option<&'self str> {
if self.finished { return None }
let start = self.position;
let len = self.string.len();
let mut iter = match self.only_ascii {
true => Left(self.string.byte_iter().enumerate()),
false => Right(self.string.char_offset_iter())
};
if self.count > 0 {
match self.iter {
loop {
let (idx, next) = match iter {
// this gives a *huge* speed up for splitting on ASCII
// characters (e.g. '\n' or ' ')
ByteOffset(ref mut iter) =>
for (idx, &byte) in *iter {
if self.sep.matches(byte as char) {
self.position = idx + 1;
self.count -= 1;
return Some(unsafe {
raw::slice_bytes(self.string, start, idx)
})
}
},
CharOffset(ref mut iter) =>
for (idx, ch) in *iter {
if self.sep.matches(ch) {
// skip over the separator
self.position = self.string.char_range_at(idx).next;
self.count -= 1;
return Some(unsafe {
raw::slice_bytes(self.string, start, idx)
})
}
Left(ref mut it) => match it.next() {
Some((idx, byte)) if byte < 128u8 && self.sep.matches(byte as char) =>
(idx, idx + 1),
Some(*) => loop,
None => break,
},
Right(ref mut it) => match it.next() {
Some((idx, ch)) if self.sep.matches(ch) =>
(idx, self.string.char_range_at(idx).next),
Some(*) => loop,
None => break,
}
};
unsafe {
let elt = raw::slice_bytes(self.string, 0, idx);
self.string = raw::slice_bytes(self.string, next, len);
return Some(elt)
}
}
self.get_end()
}
}
impl<'self, Sep: CharEq> DoubleEndedIterator<&'self str>
for CharSplitIterator<'self, Sep> {
#[inline]
fn next_back(&mut self) -> Option<&'self str> {
if self.finished { return None }
if !self.allow_trailing_empty {
self.allow_trailing_empty = true;
match self.next_back() {
Some(elt) if !elt.is_empty() => return Some(elt),
_ => if self.finished { return None }
}
}
let len = self.string.len();
let mut iter = match self.only_ascii {
true => Left(self.string.byte_rev_iter().enumerate()),
false => Right(self.string.char_offset_iter())
};
loop {
let (idx, next) = match iter {
Left(ref mut it) => match it.next() {
Some((j, byte)) if byte < 128u8 && self.sep.matches(byte as char) => {
let idx = self.string.len() - j - 1;
(idx, idx + 1)
},
Some(*) => loop,
None => break,
},
Right(ref mut it) => match it.next_back() {
Some((idx, ch)) if self.sep.matches(ch) =>
(idx, self.string.char_range_at(idx).next),
Some(*) => loop,
None => break,
}
};
unsafe {
let elt = raw::slice_bytes(self.string, next, len);
self.string = raw::slice_bytes(self.string, 0, idx);
return Some(elt)
}
}
self.finished = true;
if self.allow_trailing_empty || start < len {
Some(unsafe { raw::slice_bytes(self.string, start, len) })
Some(self.string)
}
}
impl<'self, Sep: CharEq> Iterator<&'self str> for CharSplitNIterator<'self, Sep> {
#[inline]
fn next(&mut self) -> Option<&'self str> {
if self.count != 0 {
self.count -= 1;
if self.invert { self.iter.next_back() } else { self.iter.next() }
} else {
None
self.iter.get_end()
}
}
}
@ -1271,9 +1341,10 @@ pub trait StrSlice<'self> {
fn char_offset_iter(&self) -> CharOffsetIterator<'self>;
fn char_offset_rev_iter(&self) -> CharOffsetRevIterator<'self>;
fn split_iter<Sep: CharEq>(&self, sep: Sep) -> CharSplitIterator<'self, Sep>;
fn splitn_iter<Sep: CharEq>(&self, sep: Sep, count: uint) -> CharSplitIterator<'self, Sep>;
fn split_options_iter<Sep: CharEq>(&self, sep: Sep, count: uint, allow_trailing_empty: bool)
-> CharSplitIterator<'self, Sep>;
fn splitn_iter<Sep: CharEq>(&self, sep: Sep, count: uint) -> CharSplitNIterator<'self, Sep>;
fn split_terminator_iter<Sep: CharEq>(&self, sep: Sep) -> CharSplitIterator<'self, Sep>;
fn rsplit_iter<Sep: CharEq>(&self, sep: Sep) -> CharRSplitIterator<'self, Sep>;
fn rsplitn_iter<Sep: CharEq>(&self, sep: Sep, count: uint) -> CharSplitNIterator<'self, Sep>;
fn matches_index_iter(&self, sep: &'self str) -> MatchesIndexIterator<'self>;
fn split_str_iter(&self, &'self str) -> StrSplitIterator<'self>;
fn line_iter(&self) -> CharSplitIterator<'self, char>;
@ -1410,40 +1481,78 @@ impl<'self> StrSlice<'self> for &'self str {
/// ~~~
#[inline]
fn split_iter<Sep: CharEq>(&self, sep: Sep) -> CharSplitIterator<'self, Sep> {
self.split_options_iter(sep, self.len(), true)
CharSplitIterator {
string: *self,
only_ascii: sep.only_ascii(),
sep: sep,
allow_trailing_empty: true,
finished: false,
}
}
/// An iterator over substrings of `self`, separated by characters
/// matched by `sep`, restricted to splitting at most `count`
/// times.
#[inline]
fn splitn_iter<Sep: CharEq>(&self, sep: Sep, count: uint) -> CharSplitIterator<'self, Sep> {
self.split_options_iter(sep, count, true)
fn splitn_iter<Sep: CharEq>(&self, sep: Sep, count: uint)
-> CharSplitNIterator<'self, Sep> {
CharSplitNIterator {
iter: self.split_iter(sep),
count: count,
invert: false,
}
}
/// An iterator over substrings of `self`, separated by characters
/// matched by `sep`, splitting at most `count` times, and
/// possibly not including the trailing empty substring, if it
/// exists.
/// matched by `sep`.
///
/// Equivalent to `split_iter`, except that the trailing substring
/// is skipped if empty (terminator semantics).
///
/// # Example
///
/// ~~~ {.rust}
/// let v: ~[&str] = "A.B.".split_terminator_iter('.').collect();
/// assert_eq!(v, ~["A", "B"]);
/// ~~~
#[inline]
fn split_options_iter<Sep: CharEq>(&self, sep: Sep, count: uint, allow_trailing_empty: bool)
fn split_terminator_iter<Sep: CharEq>(&self, sep: Sep)
-> CharSplitIterator<'self, Sep> {
let iter = if sep.only_ascii() {
ByteOffset(self.as_bytes().iter().enumerate())
} else {
CharOffset(self.char_offset_iter())
};
CharSplitIterator {
iter: iter,
string: *self,
position: 0,
sep: sep,
count: count,
allow_trailing_empty: allow_trailing_empty,
finished: false,
allow_trailing_empty: false,
..self.split_iter(sep)
}
}
/// An iterator over substrings of `self`, separated by characters
/// matched by `sep`, in reverse order
///
/// # Example
///
/// ~~~ {.rust}
/// let v: ~[&str] = "Mary had a little lamb".rsplit_iter(' ').collect();
/// assert_eq!(v, ~["lamb", "little", "a", "had", "Mary"]);
/// ~~~
#[inline]
fn rsplit_iter<Sep: CharEq>(&self, sep: Sep) -> CharRSplitIterator<'self, Sep> {
self.split_iter(sep).invert()
}
/// An iterator over substrings of `self`, separated by characters
/// matched by `sep`, starting from the end of the string.
/// Restricted to splitting at most `count` times.
#[inline]
fn rsplitn_iter<Sep: CharEq>(&self, sep: Sep, count: uint)
-> CharSplitNIterator<'self, Sep> {
CharSplitNIterator {
iter: self.split_iter(sep),
count: count,
invert: true,
}
}
/// An iterator over the start and end indices of each match of
/// `sep` within `self`.
#[inline]
@ -1477,7 +1586,7 @@ impl<'self> StrSlice<'self> for &'self str {
/// by `\n`).
#[inline]
fn line_iter(&self) -> CharSplitIterator<'self, char> {
self.split_options_iter('\n', self.len(), false)
self.split_terminator_iter('\n')
}
/// An iterator over the lines of a string, separated by either
@ -3371,17 +3480,33 @@ mod tests {
let data = "\nMäry häd ä little lämb\nLittle lämb\n";
let split: ~[&str] = data.split_iter(' ').collect();
assert_eq!(split, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
assert_eq!( split, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
let mut rsplit: ~[&str] = data.rsplit_iter(' ').collect();
rsplit.reverse();
assert_eq!(rsplit, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
let split: ~[&str] = data.split_iter(|c: char| c == ' ').collect();
assert_eq!(split, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
assert_eq!( split, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
let mut rsplit: ~[&str] = data.rsplit_iter(|c: char| c == ' ').collect();
rsplit.reverse();
assert_eq!(rsplit, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
// Unicode
let split: ~[&str] = data.split_iter('ä').collect();
assert_eq!(split, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
assert_eq!( split, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
let mut rsplit: ~[&str] = data.rsplit_iter('ä').collect();
rsplit.reverse();
assert_eq!(rsplit, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
let split: ~[&str] = data.split_iter(|c: char| c == 'ä').collect();
assert_eq!(split, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
assert_eq!( split, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
let mut rsplit: ~[&str] = data.rsplit_iter(|c: char| c == 'ä').collect();
rsplit.reverse();
assert_eq!(rsplit, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
}
#[test]
@ -3402,14 +3527,49 @@ mod tests {
assert_eq!(split, ~["\nM", "ry h", "d ", " little lämb\nLittle lämb\n"]);
}
#[test]
fn test_rsplitn_char_iterator() {
let data = "\nMäry häd ä little lämb\nLittle lämb\n";
let mut split: ~[&str] = data.rsplitn_iter(' ', 3).collect();
split.reverse();
assert_eq!(split, ~["\nMäry häd ä", "little", "lämb\nLittle", "lämb\n"]);
let mut split: ~[&str] = data.rsplitn_iter(|c: char| c == ' ', 3).collect();
split.reverse();
assert_eq!(split, ~["\nMäry häd ä", "little", "lämb\nLittle", "lämb\n"]);
// Unicode
let mut split: ~[&str] = data.rsplitn_iter('ä', 3).collect();
split.reverse();
assert_eq!(split, ~["\nMäry häd ", " little l", "mb\nLittle l", "mb\n"]);
let mut split: ~[&str] = data.rsplitn_iter(|c: char| c == 'ä', 3).collect();
split.reverse();
assert_eq!(split, ~["\nMäry häd ", " little l", "mb\nLittle l", "mb\n"]);
}
#[test]
fn test_split_char_iterator_no_trailing() {
let data = "\nMäry häd ä little lämb\nLittle lämb\n";
let split: ~[&str] = data.split_options_iter('\n', 1000, true).collect();
let split: ~[&str] = data.split_iter('\n').collect();
assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb", ""]);
let split: ~[&str] = data.split_options_iter('\n', 1000, false).collect();
let split: ~[&str] = data.split_terminator_iter('\n').collect();
assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb"]);
}
#[test]
fn test_rev_split_char_iterator_no_trailing() {
let data = "\nMäry häd ä little lämb\nLittle lämb\n";
let mut split: ~[&str] = data.split_iter('\n').invert().collect();
split.reverse();
assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb", ""]);
let mut split: ~[&str] = data.split_terminator_iter('\n').invert().collect();
split.reverse();
assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb"]);
}