std: remove substr & str::count_*, methodise char_len, implement slice_chars.

The confusing mixture of byte index and character count meant that every
use of .substr was incorrect; replaced by slice_chars which only uses
character indices. The old behaviour of `.substr(start, n)` can be emulated
via `.slice_from(start).slice_chars(0, n)`.
This commit is contained in:
Huon Wilson 2013-06-11 21:37:22 +10:00
parent 9fff8c6eba
commit 3ac00a9489
10 changed files with 76 additions and 104 deletions

View File

@ -1410,7 +1410,7 @@ let new_favorite_crayon_name = favorite_crayon_name.trim();
if favorite_crayon_name.len() > 5 { if favorite_crayon_name.len() > 5 {
// Create a substring // Create a substring
println(favorite_crayon_name.substr(0, 5)); println(favorite_crayon_name.slice_chars(0, 5));
} }
~~~ ~~~

View File

@ -84,9 +84,9 @@ pub fn of_str(str: @~str) -> Rope {
* *
* # Return value * # Return value
* *
* A rope representing the same string as `str.substr(byte_offset, * A rope representing the same string as `str.slice(byte_offset,
* byte_len)`. Depending on `byte_len`, this rope may be empty, flat * byte_offset + byte_len)`. Depending on `byte_len`, this rope may
* or complex. * be empty, flat or complex.
* *
* # Performance note * # Performance note
* *
@ -588,7 +588,7 @@ pub mod node {
* * char_len - The number of chars in the leaf. * * char_len - The number of chars in the leaf.
* * content - Contents of the leaf. * * content - Contents of the leaf.
* *
* Note that we can have `char_len < str::char_len(content)`, if * Note that we can have `char_len < content.char_len()`, if
* this leaf is only a subset of the string. Also note that the * this leaf is only a subset of the string. Also note that the
* string can be shared between several ropes, e.g. for indexing * string can be shared between several ropes, e.g. for indexing
* purposes. * purposes.
@ -680,7 +680,7 @@ pub mod node {
*/ */
pub fn of_substr(str: @~str, byte_start: uint, byte_len: uint) -> @Node { pub fn of_substr(str: @~str, byte_start: uint, byte_len: uint) -> @Node {
return of_substr_unsafer(str, byte_start, byte_len, return of_substr_unsafer(str, byte_start, byte_len,
str::count_chars(*str, byte_start, byte_len)); str.slice(byte_start, byte_start + byte_len).char_len());
} }
/** /**
@ -734,7 +734,7 @@ pub mod node {
if i == 0u { first_leaf_char_len } if i == 0u { first_leaf_char_len }
else { hint_max_leaf_char_len }; else { hint_max_leaf_char_len };
let chunk_byte_len = let chunk_byte_len =
str::count_bytes(*str, offset, chunk_char_len); str.slice_from(offset).slice_chars(0, chunk_char_len).len();
nodes[i] = @Leaf(Leaf { nodes[i] = @Leaf(Leaf {
byte_offset: offset, byte_offset: offset,
byte_len: chunk_byte_len, byte_len: chunk_byte_len,
@ -938,7 +938,7 @@ pub mod node {
match (*node) { match (*node) {
node::Leaf(x) => { node::Leaf(x) => {
let char_len = let char_len =
str::count_chars(*x.content, byte_offset, byte_len); x.content.slice(byte_offset, byte_offset + byte_len).char_len();
return @Leaf(Leaf { return @Leaf(Leaf {
byte_offset: byte_offset, byte_offset: byte_offset,
byte_len: byte_len, byte_len: byte_len,
@ -1002,9 +1002,9 @@ pub mod node {
return node; return node;
} }
let byte_offset = let byte_offset =
str::count_bytes(*x.content, 0u, char_offset); x.content.slice_chars(0, char_offset).len();
let byte_len = let byte_len =
str::count_bytes(*x.content, byte_offset, char_len); x.content.slice_from(byte_offset).slice_chars(0, char_len).len();
return @Leaf(Leaf { return @Leaf(Leaf {
byte_offset: byte_offset, byte_offset: byte_offset,
byte_len: byte_len, byte_len: byte_len,
@ -1312,7 +1312,7 @@ mod tests {
let sample = @~"0123456789ABCDE"; let sample = @~"0123456789ABCDE";
let r = of_str(sample); let r = of_str(sample);
assert_eq!(char_len(r), str::char_len(*sample)); assert_eq!(char_len(r), sample.char_len());
assert!(rope_to_string(r) == *sample); assert!(rope_to_string(r) == *sample);
} }
@ -1328,7 +1328,7 @@ mod tests {
} }
let sample = @copy *buf; let sample = @copy *buf;
let r = of_str(sample); let r = of_str(sample);
assert!(char_len(r) == str::char_len(*sample)); assert_eq!(char_len(r), sample.char_len());
assert!(rope_to_string(r) == *sample); assert!(rope_to_string(r) == *sample);
let mut string_iter = 0u; let mut string_iter = 0u;
@ -1374,7 +1374,7 @@ mod tests {
} }
} }
assert_eq!(len, str::char_len(*sample)); assert_eq!(len, sample.char_len());
} }
#[test] #[test]

View File

@ -27,7 +27,7 @@ pub fn get_dbpath_for_term(term: &str) -> Option<~path> {
let homedir = os::homedir(); let homedir = os::homedir();
let mut dirs_to_search = ~[]; let mut dirs_to_search = ~[];
let first_char = term.substr(0, 1); let first_char = term.char_at(0);
// Find search directory // Find search directory
match getenv("TERMINFO") { match getenv("TERMINFO") {
@ -57,12 +57,12 @@ pub fn get_dbpath_for_term(term: &str) -> Option<~path> {
// Look for the terminal in all of the search directories // Look for the terminal in all of the search directories
for dirs_to_search.each |p| { for dirs_to_search.each |p| {
let newp = ~p.push_many(&[first_char.to_owned(), term.to_owned()]); let newp = ~p.push_many(&[str::from_char(first_char), term.to_owned()]);
if os::path_exists(p) && os::path_exists(newp) { if os::path_exists(p) && os::path_exists(newp) {
return Some(newp); return Some(newp);
} }
// on some installations the dir is named after the hex of the char (e.g. OS X) // on some installations the dir is named after the hex of the char (e.g. OS X)
let newp = ~p.push_many(&[fmt!("%x", first_char[0] as uint), term.to_owned()]); let newp = ~p.push_many(&[fmt!("%x", first_char as uint), term.to_owned()]);
if os::path_exists(p) && os::path_exists(newp) { if os::path_exists(p) && os::path_exists(newp) {
return Some(newp); return Some(newp);
} }

View File

@ -590,7 +590,7 @@ pub fn build_session_options(binary: @~str,
// FIXME: #4318 Instead of to_ascii and to_str_ascii, could use // FIXME: #4318 Instead of to_ascii and to_str_ascii, could use
// to_ascii_consume and to_str_consume to not do a unnecessary copy. // to_ascii_consume and to_str_consume to not do a unnecessary copy.
let level_short = level_name.substr(0,1); let level_short = level_name.slice_chars(0, 1);
let level_short = level_short.to_ascii().to_upper().to_str_ascii(); let level_short = level_short.to_ascii().to_upper().to_str_ascii();
let flags = vec::append(getopts::opt_strs(matches, level_short), let flags = vec::append(getopts::opt_strs(matches, level_short),
getopts::opt_strs(matches, level_name)); getopts::opt_strs(matches, level_name));

View File

@ -2678,14 +2678,14 @@ impl Resolver {
match module_prefix_result { match module_prefix_result {
Failed => { Failed => {
let mpath = self.idents_to_str(module_path); let mpath = self.idents_to_str(module_path);
match self.idents_to_str(module_path).rfind(':') { match mpath.rfind(':') {
Some(idx) => { Some(idx) => {
self.session.span_err(span, fmt!("unresolved import: could not find `%s` \ self.session.span_err(span, fmt!("unresolved import: could not find `%s` \
in `%s`", mpath.substr(idx, in `%s`",
mpath.len() - idx), // idx +- 1 to account for the colons
// idx - 1 to account for the extra // on either side
// colon mpath.slice_from(idx + 1),
mpath.substr(0, idx - 1))); mpath.slice_to(idx - 1)));
}, },
None => (), None => (),
}; };

View File

@ -1704,5 +1704,5 @@ fn trans_assign_op(bcx: block,
} }
fn shorten(x: ~str) -> ~str { fn shorten(x: ~str) -> ~str {
if x.len() > 60 { x.substr(0, 60).to_owned() } else { x } if x.char_len() > 60 { x.slice_chars(0, 60).to_owned() } else { x }
} }

View File

@ -284,7 +284,7 @@ fn run_cmd(repl: &mut Repl, _in: @io::Reader, _out: @io::Writer,
for args.each |arg| { for args.each |arg| {
let (crate, filename) = let (crate, filename) =
if arg.ends_with(".rs") || arg.ends_with(".rc") { if arg.ends_with(".rs") || arg.ends_with(".rc") {
(arg.substr(0, arg.len() - 3).to_owned(), copy *arg) (arg.slice_to(arg.len() - 3).to_owned(), copy *arg)
} else { } else {
(copy *arg, arg + ".rs") (copy *arg, arg + ".rs")
}; };
@ -342,7 +342,8 @@ pub fn run_line(repl: &mut Repl, in: @io::Reader, out: @io::Writer, line: ~str,
// FIXME #5898: conflicts with Cell.take(), so can't be at the top level // FIXME #5898: conflicts with Cell.take(), so can't be at the top level
use core::iterator::IteratorUtil; use core::iterator::IteratorUtil;
let full = line.substr(1, line.len() - 1); // drop the : and the \n (one byte each)
let full = line.slice(1, line.len() - 1);
let split: ~[~str] = full.word_iter().transform(|s| s.to_owned()).collect(); let split: ~[~str] = full.word_iter().transform(|s| s.to_owned()).collect();
let len = split.len(); let len = split.len();

View File

@ -848,15 +848,6 @@ fn match_at<'a,'b>(haystack: &'a str, needle: &'b str, at: uint) -> bool {
return true; return true;
} }
/*
Section: String properties
*/
/// Returns the number of characters that a string holds
#[inline(always)]
pub fn char_len(s: &str) -> uint { count_chars(s, 0u, s.len()) }
/* /*
Section: Misc Section: Misc
*/ */
@ -974,46 +965,6 @@ pub fn with_capacity(capacity: uint) -> ~str {
buf buf
} }
/**
* As char_len but for a slice of a string
*
* # Arguments
*
* * s - A valid string
* * start - The position inside `s` where to start counting in bytes
* * end - The position where to stop counting
*
* # Return value
*
* The number of Unicode characters in `s` between the given indices.
*/
pub fn count_chars(s: &str, start: uint, end: uint) -> uint {
assert!(s.is_char_boundary(start));
assert!(s.is_char_boundary(end));
let mut (i, len) = (start, 0u);
while i < end {
let next = s.char_range_at(i).next;
len += 1u;
i = next;
}
return len;
}
/// Counts the number of bytes taken by the first `n` chars in `s`
/// starting from `start`.
pub fn count_bytes<'b>(s: &'b str, start: uint, n: uint) -> uint {
assert!(s.is_char_boundary(start));
let mut (end, cnt) = (start, n);
let l = s.len();
while cnt > 0u {
assert!(end < l);
let next = s.char_range_at(end).next;
cnt -= 1u;
end = next;
}
end - start
}
/// Given a first byte, determine how many bytes are in this UTF-8 character /// Given a first byte, determine how many bytes are in this UTF-8 character
pub fn utf8_char_width(b: u8) -> uint { pub fn utf8_char_width(b: u8) -> uint {
let byte: uint = b as uint; let byte: uint = b as uint;
@ -1394,11 +1345,14 @@ pub trait StrSlice<'self> {
fn is_alphanumeric(&self) -> bool; fn is_alphanumeric(&self) -> bool;
fn len(&self) -> uint; fn len(&self) -> uint;
fn char_len(&self) -> uint; fn char_len(&self) -> uint;
fn slice(&self, begin: uint, end: uint) -> &'self str; fn slice(&self, begin: uint, end: uint) -> &'self str;
fn slice_from(&self, begin: uint) -> &'self str; fn slice_from(&self, begin: uint) -> &'self str;
fn slice_to(&self, end: uint) -> &'self str; fn slice_to(&self, end: uint) -> &'self str;
fn slice_chars(&self, begin: uint, end: uint) -> &'self str;
fn starts_with(&self, needle: &str) -> bool; fn starts_with(&self, needle: &str) -> bool;
fn substr(&self, begin: uint, n: uint) -> &'self str;
fn escape_default(&self) -> ~str; fn escape_default(&self) -> ~str;
fn escape_unicode(&self) -> ~str; fn escape_unicode(&self) -> ~str;
fn trim(&self) -> &'self str; fn trim(&self) -> &'self str;
@ -1595,7 +1549,8 @@ impl<'self> StrSlice<'self> for &'self str {
} }
/// Returns the number of characters that a string holds /// Returns the number of characters that a string holds
#[inline] #[inline]
fn char_len(&self) -> uint { char_len(*self) } fn char_len(&self) -> uint { self.iter().count() }
/** /**
* Returns a slice of the given string from the byte range * Returns a slice of the given string from the byte range
* [`begin`..`end`) * [`begin`..`end`)
@ -1626,6 +1581,32 @@ impl<'self> StrSlice<'self> for &'self str {
fn slice_to(&self, end: uint) -> &'self str { fn slice_to(&self, end: uint) -> &'self str {
self.slice(0, end) self.slice(0, end)
} }
/// Returns a slice of the string from the char range
/// [`begin`..`end`).
///
/// Fails if `begin` > `end` or the either `begin` or `end` are
/// beyond the last character of the string.
fn slice_chars(&self, begin: uint, end: uint) -> &'self str {
assert!(begin <= end);
// not sure how to use the iterators for this nicely.
let mut (position, count) = (0, 0);
let l = self.len();
while count < begin && position < l {
position = self.char_range_at(position).next;
count += 1;
}
if count < begin { fail!("Attempted to begin slice_chars beyond end of string") }
let start_byte = position;
while count < end && position < l {
position = self.char_range_at(position).next;
count += 1;
}
if count < end { fail!("Attempted to end slice_chars beyond end of string") }
self.slice(start_byte, position)
}
/// Returns true if `needle` is a prefix of the string. /// Returns true if `needle` is a prefix of the string.
fn starts_with<'a>(&self, needle: &'a str) -> bool { fn starts_with<'a>(&self, needle: &'a str) -> bool {
let (self_len, needle_len) = (self.len(), needle.len()); let (self_len, needle_len) = (self.len(), needle.len());
@ -1641,16 +1622,6 @@ impl<'self> StrSlice<'self> for &'self str {
else { match_at(*self, needle, self_len - needle_len) } else { match_at(*self, needle, self_len - needle_len) }
} }
/**
* Take a substring of another.
*
* Returns a string containing `n` characters starting at byte offset
* `begin`.
*/
#[inline]
fn substr(&self, begin: uint, n: uint) -> &'self str {
self.slice(begin, begin + count_bytes(*self, begin, n))
}
/// Escape each char in `s` with char::escape_default. /// Escape each char in `s` with char::escape_default.
#[inline] #[inline]
fn escape_default(&self) -> ~str { escape_default(*self) } fn escape_default(&self) -> ~str { escape_default(*self) }
@ -2367,14 +2338,14 @@ mod tests {
assert_eq!("\u2620".len(), 3u); assert_eq!("\u2620".len(), 3u);
assert_eq!("\U0001d11e".len(), 4u); assert_eq!("\U0001d11e".len(), 4u);
assert_eq!(char_len(""), 0u); assert_eq!("".char_len(), 0u);
assert_eq!(char_len("hello world"), 11u); assert_eq!("hello world".char_len(), 11u);
assert_eq!(char_len("\x63"), 1u); assert_eq!("\x63".char_len(), 1u);
assert_eq!(char_len("\xa2"), 1u); assert_eq!("\xa2".char_len(), 1u);
assert_eq!(char_len("\u03c0"), 1u); assert_eq!("\u03c0".char_len(), 1u);
assert_eq!(char_len("\u2620"), 1u); assert_eq!("\u2620".char_len(), 1u);
assert_eq!(char_len("\U0001d11e"), 1u); assert_eq!("\U0001d11e".char_len(), 1u);
assert_eq!(char_len("ประเทศไทย中华Việt Nam"), 19u); assert_eq!("ประเทศไทย中华Việt Nam".char_len(), 19u);
} }
#[test] #[test]
@ -2509,13 +2480,13 @@ mod tests {
} }
#[test] #[test]
fn test_substr() { fn test_slice_chars() {
fn t(a: &str, b: &str, start: int) { fn t(a: &str, b: &str, start: uint) {
assert_eq!(a.substr(start as uint, b.len()), b); assert_eq!(a.slice_chars(start, start + b.char_len()), b);
} }
t("hello", "llo", 2); t("hello", "llo", 2);
t("hello", "el", 1); t("hello", "el", 1);
assert_eq!("ะเทศไท", "ประเทศไทย中华Việt Nam".substr(6u, 6u)); assert_eq!("ะเทศไท", "ประเทศไทย中华Việt Nam".slice_chars(2, 8));
} }
#[test] #[test]

View File

@ -325,7 +325,7 @@ pub mod ct {
'o' => TyOctal, 'o' => TyOctal,
'f' => TyFloat, 'f' => TyFloat,
'?' => TyPoly, '?' => TyPoly,
_ => err(~"unknown type in conversion: " + s.substr(i, 1)) _ => err(fmt!("unknown type in conversion: %c", s.char_at(i)))
}; };
Parsed::new(t, i + 1) Parsed::new(t, i + 1)
@ -546,7 +546,7 @@ pub mod rt {
// displayed // displayed
let unpadded = match cv.precision { let unpadded = match cv.precision {
CountImplied => s, CountImplied => s,
CountIs(max) => if (max as uint) < str::char_len(s) { CountIs(max) => if (max as uint) < s.char_len() {
s.slice(0, max as uint) s.slice(0, max as uint)
} else { } else {
s s
@ -584,7 +584,7 @@ pub mod rt {
~"" ~""
} else { } else {
let s = uint::to_str_radix(num, radix); let s = uint::to_str_radix(num, radix);
let len = str::char_len(s); let len = s.char_len();
if len < prec { if len < prec {
let diff = prec - len; let diff = prec - len;
let pad = str::from_chars(vec::from_elem(diff, '0')); let pad = str::from_chars(vec::from_elem(diff, '0'));
@ -614,7 +614,7 @@ pub mod rt {
} }
CountIs(width) => { width as uint } CountIs(width) => { width as uint }
}; };
let strlen = str::char_len(s) + headsize; let strlen = s.char_len() + headsize;
if uwidth <= strlen { if uwidth <= strlen {
for head.iter().advance |&c| { for head.iter().advance |&c| {
buf.push_char(c); buf.push_char(c);

View File

@ -21,7 +21,7 @@ pub fn main() {
let schs: ~[char] = s.iter().collect(); let schs: ~[char] = s.iter().collect();
assert!(s.len() == 10u); assert!(s.len() == 10u);
assert!(str::char_len(s) == 4u); assert!(s.char_len() == 4u);
assert!(schs.len() == 4u); assert!(schs.len() == 4u);
assert!(str::from_chars(schs) == s); assert!(str::from_chars(schs) == s);
assert!(s.char_at(0u) == 'e'); assert!(s.char_at(0u) == 'e');