std::str: Double-ended CharSplitIterator
Add new methods `.rsplit_iter()` and `.rsplitn_iter()` for &str. Separate out CharSplitIterator and CharSplitNIterator: CharSplitIterator (`split_iter` and `rsplit_iter`) is made double-ended, while CharSplitNIterator (`splitn_iter` and `rsplitn_iter`, limited to N splits) is not, since these don't have the same symmetry. With CharSplitIterator being double-ended, derived iterators like `line_iter` and `word_iter` are too.
This commit is contained in:
parent
a17c7e4f2c
commit
b59d50368e
@ -21,9 +21,10 @@ use char;
|
||||
use char::Char;
|
||||
use clone::{Clone, DeepClone};
|
||||
use container::{Container, Mutable};
|
||||
use either::{Left, Right};
|
||||
use iter::Times;
|
||||
use iterator::{Iterator, FromIterator, Extendable};
|
||||
use iterator::{Filter, AdditiveIterator, Map, Enumerate};
|
||||
use iterator::{Filter, AdditiveIterator, Map};
|
||||
use iterator::{Invert, DoubleEndedIterator};
|
||||
use libc;
|
||||
use num::{Saturating, Zero};
|
||||
@ -359,28 +360,32 @@ pub type ByteIterator<'self> =
|
||||
/// Use with the `std::iterator` module.
|
||||
pub type ByteRevIterator<'self> = Invert<ByteIterator<'self>>;
|
||||
|
||||
/// An iterator over byte index and either &u8 or char
|
||||
#[deriving(Clone)]
|
||||
enum OffsetIterator<'self> {
|
||||
// use ByteIterator here when it can be cloned
|
||||
ByteOffset(Enumerate<vec::VecIterator<'self, u8>>),
|
||||
CharOffset(CharOffsetIterator<'self>),
|
||||
}
|
||||
|
||||
/// An iterator over the substrings of a string, separated by `sep`.
|
||||
#[deriving(Clone)]
|
||||
pub struct CharSplitIterator<'self,Sep> {
|
||||
priv iter: OffsetIterator<'self>,
|
||||
pub struct CharSplitIterator<'self, Sep> {
|
||||
/// The slice remaining to be iterated
|
||||
priv string: &'self str,
|
||||
priv position: uint,
|
||||
priv sep: Sep,
|
||||
/// The number of splits remaining
|
||||
priv count: uint,
|
||||
/// Whether an empty string at the end is allowed
|
||||
priv allow_trailing_empty: bool,
|
||||
priv only_ascii: bool,
|
||||
priv finished: bool,
|
||||
}
|
||||
|
||||
/// An iterator over the substrings of a string, separated by `sep`,
|
||||
/// starting from the back of the string.
|
||||
pub type CharRSplitIterator<'self, Sep> = Invert<CharSplitIterator<'self, Sep>>;
|
||||
|
||||
/// An iterator over the substrings of a string, separated by `sep`,
|
||||
/// splitting at most `count` times.
|
||||
#[deriving(Clone)]
|
||||
pub struct CharSplitNIterator<'self, Sep> {
|
||||
priv iter: CharSplitIterator<'self, Sep>,
|
||||
/// The number of splits remaining
|
||||
priv count: uint,
|
||||
priv invert: bool,
|
||||
}
|
||||
|
||||
/// An iterator over the words of a string, separated by a sequence of whitespace
|
||||
pub type WordIterator<'self> =
|
||||
Filter<'self, &'self str, CharSplitIterator<'self, extern "Rust" fn(char) -> bool>>;
|
||||
@ -389,46 +394,111 @@ pub type WordIterator<'self> =
|
||||
pub type AnyLineIterator<'self> =
|
||||
Map<'self, &'self str, &'self str, CharSplitIterator<'self, char>>;
|
||||
|
||||
impl<'self, Sep> CharSplitIterator<'self, Sep> {
|
||||
#[inline]
|
||||
fn get_end(&mut self) -> Option<&'self str> {
|
||||
if !self.finished && (self.allow_trailing_empty || self.string.len() > 0) {
|
||||
self.finished = true;
|
||||
Some(self.string)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'self, Sep: CharEq> Iterator<&'self str> for CharSplitIterator<'self, Sep> {
|
||||
#[inline]
|
||||
fn next(&mut self) -> Option<&'self str> {
|
||||
if self.finished { return None }
|
||||
|
||||
let start = self.position;
|
||||
let len = self.string.len();
|
||||
let mut iter = match self.only_ascii {
|
||||
true => Left(self.string.byte_iter().enumerate()),
|
||||
false => Right(self.string.char_offset_iter())
|
||||
};
|
||||
|
||||
if self.count > 0 {
|
||||
match self.iter {
|
||||
loop {
|
||||
let (idx, next) = match iter {
|
||||
// this gives a *huge* speed up for splitting on ASCII
|
||||
// characters (e.g. '\n' or ' ')
|
||||
ByteOffset(ref mut iter) =>
|
||||
for (idx, &byte) in *iter {
|
||||
if self.sep.matches(byte as char) {
|
||||
self.position = idx + 1;
|
||||
self.count -= 1;
|
||||
return Some(unsafe {
|
||||
raw::slice_bytes(self.string, start, idx)
|
||||
})
|
||||
}
|
||||
},
|
||||
CharOffset(ref mut iter) =>
|
||||
for (idx, ch) in *iter {
|
||||
if self.sep.matches(ch) {
|
||||
// skip over the separator
|
||||
self.position = self.string.char_range_at(idx).next;
|
||||
self.count -= 1;
|
||||
return Some(unsafe {
|
||||
raw::slice_bytes(self.string, start, idx)
|
||||
})
|
||||
}
|
||||
Left(ref mut it) => match it.next() {
|
||||
Some((idx, byte)) if byte < 128u8 && self.sep.matches(byte as char) =>
|
||||
(idx, idx + 1),
|
||||
Some(*) => loop,
|
||||
None => break,
|
||||
},
|
||||
Right(ref mut it) => match it.next() {
|
||||
Some((idx, ch)) if self.sep.matches(ch) =>
|
||||
(idx, self.string.char_range_at(idx).next),
|
||||
Some(*) => loop,
|
||||
None => break,
|
||||
}
|
||||
};
|
||||
unsafe {
|
||||
let elt = raw::slice_bytes(self.string, 0, idx);
|
||||
self.string = raw::slice_bytes(self.string, next, len);
|
||||
return Some(elt)
|
||||
}
|
||||
}
|
||||
self.get_end()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'self, Sep: CharEq> DoubleEndedIterator<&'self str>
|
||||
for CharSplitIterator<'self, Sep> {
|
||||
#[inline]
|
||||
fn next_back(&mut self) -> Option<&'self str> {
|
||||
if self.finished { return None }
|
||||
|
||||
if !self.allow_trailing_empty {
|
||||
self.allow_trailing_empty = true;
|
||||
match self.next_back() {
|
||||
Some(elt) if !elt.is_empty() => return Some(elt),
|
||||
_ => if self.finished { return None }
|
||||
}
|
||||
}
|
||||
let len = self.string.len();
|
||||
let mut iter = match self.only_ascii {
|
||||
true => Left(self.string.byte_rev_iter().enumerate()),
|
||||
false => Right(self.string.char_offset_iter())
|
||||
};
|
||||
|
||||
loop {
|
||||
let (idx, next) = match iter {
|
||||
Left(ref mut it) => match it.next() {
|
||||
Some((j, byte)) if byte < 128u8 && self.sep.matches(byte as char) => {
|
||||
let idx = self.string.len() - j - 1;
|
||||
(idx, idx + 1)
|
||||
},
|
||||
Some(*) => loop,
|
||||
None => break,
|
||||
},
|
||||
Right(ref mut it) => match it.next_back() {
|
||||
Some((idx, ch)) if self.sep.matches(ch) =>
|
||||
(idx, self.string.char_range_at(idx).next),
|
||||
Some(*) => loop,
|
||||
None => break,
|
||||
}
|
||||
};
|
||||
unsafe {
|
||||
let elt = raw::slice_bytes(self.string, next, len);
|
||||
self.string = raw::slice_bytes(self.string, 0, idx);
|
||||
return Some(elt)
|
||||
}
|
||||
}
|
||||
self.finished = true;
|
||||
if self.allow_trailing_empty || start < len {
|
||||
Some(unsafe { raw::slice_bytes(self.string, start, len) })
|
||||
Some(self.string)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'self, Sep: CharEq> Iterator<&'self str> for CharSplitNIterator<'self, Sep> {
|
||||
#[inline]
|
||||
fn next(&mut self) -> Option<&'self str> {
|
||||
if self.count != 0 {
|
||||
self.count -= 1;
|
||||
if self.invert { self.iter.next_back() } else { self.iter.next() }
|
||||
} else {
|
||||
None
|
||||
self.iter.get_end()
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1271,9 +1341,10 @@ pub trait StrSlice<'self> {
|
||||
fn char_offset_iter(&self) -> CharOffsetIterator<'self>;
|
||||
fn char_offset_rev_iter(&self) -> CharOffsetRevIterator<'self>;
|
||||
fn split_iter<Sep: CharEq>(&self, sep: Sep) -> CharSplitIterator<'self, Sep>;
|
||||
fn splitn_iter<Sep: CharEq>(&self, sep: Sep, count: uint) -> CharSplitIterator<'self, Sep>;
|
||||
fn split_options_iter<Sep: CharEq>(&self, sep: Sep, count: uint, allow_trailing_empty: bool)
|
||||
-> CharSplitIterator<'self, Sep>;
|
||||
fn splitn_iter<Sep: CharEq>(&self, sep: Sep, count: uint) -> CharSplitNIterator<'self, Sep>;
|
||||
fn split_terminator_iter<Sep: CharEq>(&self, sep: Sep) -> CharSplitIterator<'self, Sep>;
|
||||
fn rsplit_iter<Sep: CharEq>(&self, sep: Sep) -> CharRSplitIterator<'self, Sep>;
|
||||
fn rsplitn_iter<Sep: CharEq>(&self, sep: Sep, count: uint) -> CharSplitNIterator<'self, Sep>;
|
||||
fn matches_index_iter(&self, sep: &'self str) -> MatchesIndexIterator<'self>;
|
||||
fn split_str_iter(&self, &'self str) -> StrSplitIterator<'self>;
|
||||
fn line_iter(&self) -> CharSplitIterator<'self, char>;
|
||||
@ -1410,40 +1481,78 @@ impl<'self> StrSlice<'self> for &'self str {
|
||||
/// ~~~
|
||||
#[inline]
|
||||
fn split_iter<Sep: CharEq>(&self, sep: Sep) -> CharSplitIterator<'self, Sep> {
|
||||
self.split_options_iter(sep, self.len(), true)
|
||||
CharSplitIterator {
|
||||
string: *self,
|
||||
only_ascii: sep.only_ascii(),
|
||||
sep: sep,
|
||||
allow_trailing_empty: true,
|
||||
finished: false,
|
||||
}
|
||||
}
|
||||
|
||||
/// An iterator over substrings of `self`, separated by characters
|
||||
/// matched by `sep`, restricted to splitting at most `count`
|
||||
/// times.
|
||||
#[inline]
|
||||
fn splitn_iter<Sep: CharEq>(&self, sep: Sep, count: uint) -> CharSplitIterator<'self, Sep> {
|
||||
self.split_options_iter(sep, count, true)
|
||||
fn splitn_iter<Sep: CharEq>(&self, sep: Sep, count: uint)
|
||||
-> CharSplitNIterator<'self, Sep> {
|
||||
CharSplitNIterator {
|
||||
iter: self.split_iter(sep),
|
||||
count: count,
|
||||
invert: false,
|
||||
}
|
||||
}
|
||||
|
||||
/// An iterator over substrings of `self`, separated by characters
|
||||
/// matched by `sep`, splitting at most `count` times, and
|
||||
/// possibly not including the trailing empty substring, if it
|
||||
/// exists.
|
||||
/// matched by `sep`.
|
||||
///
|
||||
/// Equivalent to `split_iter`, except that the trailing substring
|
||||
/// is skipped if empty (terminator semantics).
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ~~~ {.rust}
|
||||
/// let v: ~[&str] = "A.B.".split_terminator_iter('.').collect();
|
||||
/// assert_eq!(v, ~["A", "B"]);
|
||||
/// ~~~
|
||||
#[inline]
|
||||
fn split_options_iter<Sep: CharEq>(&self, sep: Sep, count: uint, allow_trailing_empty: bool)
|
||||
fn split_terminator_iter<Sep: CharEq>(&self, sep: Sep)
|
||||
-> CharSplitIterator<'self, Sep> {
|
||||
let iter = if sep.only_ascii() {
|
||||
ByteOffset(self.as_bytes().iter().enumerate())
|
||||
} else {
|
||||
CharOffset(self.char_offset_iter())
|
||||
};
|
||||
CharSplitIterator {
|
||||
iter: iter,
|
||||
string: *self,
|
||||
position: 0,
|
||||
sep: sep,
|
||||
count: count,
|
||||
allow_trailing_empty: allow_trailing_empty,
|
||||
finished: false,
|
||||
allow_trailing_empty: false,
|
||||
..self.split_iter(sep)
|
||||
}
|
||||
}
|
||||
|
||||
/// An iterator over substrings of `self`, separated by characters
|
||||
/// matched by `sep`, in reverse order.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ~~~ {.rust}
|
||||
/// let v: ~[&str] = "Mary had a little lamb".rsplit_iter(' ').collect();
|
||||
/// assert_eq!(v, ~["lamb", "little", "a", "had", "Mary"]);
|
||||
/// ~~~
|
||||
#[inline]
|
||||
fn rsplit_iter<Sep: CharEq>(&self, sep: Sep) -> CharRSplitIterator<'self, Sep> {
|
||||
self.split_iter(sep).invert()
|
||||
}
|
||||
|
||||
/// An iterator over substrings of `self`, separated by characters
|
||||
/// matched by `sep`, starting from the end of the string.
|
||||
/// Restricted to splitting at most `count` times.
|
||||
#[inline]
|
||||
fn rsplitn_iter<Sep: CharEq>(&self, sep: Sep, count: uint)
|
||||
-> CharSplitNIterator<'self, Sep> {
|
||||
CharSplitNIterator {
|
||||
iter: self.split_iter(sep),
|
||||
count: count,
|
||||
invert: true,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// An iterator over the start and end indices of each match of
|
||||
/// `sep` within `self`.
|
||||
#[inline]
|
||||
@ -1477,7 +1586,7 @@ impl<'self> StrSlice<'self> for &'self str {
|
||||
/// by `\n`).
|
||||
#[inline]
|
||||
fn line_iter(&self) -> CharSplitIterator<'self, char> {
|
||||
self.split_options_iter('\n', self.len(), false)
|
||||
self.split_terminator_iter('\n')
|
||||
}
|
||||
|
||||
/// An iterator over the lines of a string, separated by either
|
||||
@ -3371,17 +3480,33 @@ mod tests {
|
||||
let data = "\nMäry häd ä little lämb\nLittle lämb\n";
|
||||
|
||||
let split: ~[&str] = data.split_iter(' ').collect();
|
||||
assert_eq!(split, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
|
||||
assert_eq!( split, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
|
||||
|
||||
let mut rsplit: ~[&str] = data.rsplit_iter(' ').collect();
|
||||
rsplit.reverse();
|
||||
assert_eq!(rsplit, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
|
||||
|
||||
let split: ~[&str] = data.split_iter(|c: char| c == ' ').collect();
|
||||
assert_eq!(split, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
|
||||
assert_eq!( split, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
|
||||
|
||||
let mut rsplit: ~[&str] = data.rsplit_iter(|c: char| c == ' ').collect();
|
||||
rsplit.reverse();
|
||||
assert_eq!(rsplit, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
|
||||
|
||||
// Unicode
|
||||
let split: ~[&str] = data.split_iter('ä').collect();
|
||||
assert_eq!(split, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
|
||||
assert_eq!( split, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
|
||||
|
||||
let mut rsplit: ~[&str] = data.rsplit_iter('ä').collect();
|
||||
rsplit.reverse();
|
||||
assert_eq!(rsplit, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
|
||||
|
||||
let split: ~[&str] = data.split_iter(|c: char| c == 'ä').collect();
|
||||
assert_eq!(split, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
|
||||
assert_eq!( split, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
|
||||
|
||||
let mut rsplit: ~[&str] = data.rsplit_iter(|c: char| c == 'ä').collect();
|
||||
rsplit.reverse();
|
||||
assert_eq!(rsplit, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@ -3402,14 +3527,49 @@ mod tests {
|
||||
assert_eq!(split, ~["\nM", "ry h", "d ", " little lämb\nLittle lämb\n"]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_rsplitn_char_iterator() {
|
||||
let data = "\nMäry häd ä little lämb\nLittle lämb\n";
|
||||
|
||||
let mut split: ~[&str] = data.rsplitn_iter(' ', 3).collect();
|
||||
split.reverse();
|
||||
assert_eq!(split, ~["\nMäry häd ä", "little", "lämb\nLittle", "lämb\n"]);
|
||||
|
||||
let mut split: ~[&str] = data.rsplitn_iter(|c: char| c == ' ', 3).collect();
|
||||
split.reverse();
|
||||
assert_eq!(split, ~["\nMäry häd ä", "little", "lämb\nLittle", "lämb\n"]);
|
||||
|
||||
// Unicode
|
||||
let mut split: ~[&str] = data.rsplitn_iter('ä', 3).collect();
|
||||
split.reverse();
|
||||
assert_eq!(split, ~["\nMäry häd ", " little l", "mb\nLittle l", "mb\n"]);
|
||||
|
||||
let mut split: ~[&str] = data.rsplitn_iter(|c: char| c == 'ä', 3).collect();
|
||||
split.reverse();
|
||||
assert_eq!(split, ~["\nMäry häd ", " little l", "mb\nLittle l", "mb\n"]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_split_char_iterator_no_trailing() {
|
||||
let data = "\nMäry häd ä little lämb\nLittle lämb\n";
|
||||
|
||||
let split: ~[&str] = data.split_options_iter('\n', 1000, true).collect();
|
||||
let split: ~[&str] = data.split_iter('\n').collect();
|
||||
assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb", ""]);
|
||||
|
||||
let split: ~[&str] = data.split_options_iter('\n', 1000, false).collect();
|
||||
let split: ~[&str] = data.split_terminator_iter('\n').collect();
|
||||
assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb"]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_rev_split_char_iterator_no_trailing() {
|
||||
let data = "\nMäry häd ä little lämb\nLittle lämb\n";
|
||||
|
||||
let mut split: ~[&str] = data.split_iter('\n').invert().collect();
|
||||
split.reverse();
|
||||
assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb", ""]);
|
||||
|
||||
let mut split: ~[&str] = data.split_terminator_iter('\n').invert().collect();
|
||||
split.reverse();
|
||||
assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb"]);
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user