Add std::istr. Issue #855

This commit is contained in:
Brian Anderson 2011-08-22 18:06:44 -07:00
parent 55c54f0db5
commit 663d07d319
5 changed files with 712 additions and 0 deletions

428
src/lib/istr.rs Normal file
View File

@ -0,0 +1,428 @@
export eq, lteq, hash, is_empty, is_not_empty, is_whitespace, byte_len,
index, rindex, find, starts_with, ends_with, substr, slice, split,
concat, connect, to_upper, replace, char_slice, trim_left, trim_right, trim,
unshift_char, shift_char, pop_char, push_char, is_utf8, from_chars, to_chars,
char_len, char_at, bytes, is_ascii, shift_byte, pop_byte;
fn eq(a: &istr, b: &istr) -> bool { a == b }
fn lteq(a: &istr, b: &istr) -> bool { a <= b }
fn hash(s: &istr) -> uint {
// djb hash.
// FIXME: replace with murmur.
let u: uint = 5381u;
for c: u8 in s { u *= 33u; u += c as uint; }
ret u;
}
// UTF-8 tags and ranges
//
// The tag_* values are the marker bits or-ed into encoded bytes by
// push_utf8_bytes; the max_* values are the exclusive upper bounds on the
// code point that still fits in a sequence of that many bytes.

// Marker of a continuation byte (10xxxxxx), as a u8 for byte comparisons.
const tag_cont_u8: u8 = 128u8;
// The same continuation marker as a uint, for building encoded bytes.
const tag_cont: uint = 128u;
// Code points below this are encoded as a single byte.
const max_one_b: uint = 128u;
// Lead-byte marker of a two-byte sequence (110xxxxx).
const tag_two_b: uint = 192u;
// Code points below this fit in two bytes.
const max_two_b: uint = 2048u;
// Lead-byte marker of a three-byte sequence (1110xxxx).
const tag_three_b: uint = 224u;
// Code points below this fit in three bytes.
const max_three_b: uint = 65536u;
// Lead-byte marker of a four-byte sequence (11110xxx).
const tag_four_b: uint = 240u;
// Code points below this fit in four bytes.
const max_four_b: uint = 2097152u;
// Lead-byte marker of a five-byte sequence (111110xx).
const tag_five_b: uint = 248u;
// Code points below this fit in five bytes.
const max_five_b: uint = 67108864u;
// Lead-byte marker of a six-byte sequence (1111110x).
const tag_six_b: uint = 252u;
// Checks that `v` is structurally well-formed UTF-8: every lead byte
// announces a width via utf8_char_width, the whole sequence fits inside `v`,
// and each byte after the lead byte carries the continuation marker.
fn is_utf8(v: &[u8]) -> bool {
    let pos = 0u;
    let n = vec::len::<u8>(v);
    while pos < n {
        let width = utf8_char_width(v[pos]);
        if width == 0u { ret false; }
        if pos + width > n { ret false; }
        // Step over the lead byte, then verify the remaining bytes of the
        // sequence.
        let stop = pos + width;
        pos += 1u;
        while pos < stop {
            if v[pos] & 192u8 != tag_cont_u8 { ret false; }
            pos += 1u;
        }
    }
    ret true;
}
fn is_ascii(s: &istr) -> bool {
let i: uint = byte_len(s);
while i > 0u { i -= 1u; if s[i] & 128u8 != 0u8 { ret false; } }
ret true;
}
/// Returns true if the string has length 0
pred is_empty(s: &istr) -> bool {
// The loop body runs only if there is at least one byte to iterate over, in
// which case the string is not empty.
for c: u8 in s { ret false; } ret true;
}
/// Returns true if the string has length greater than 0, i.e. exactly when
/// is_empty returns false.
pred is_not_empty(s: &istr) -> bool {
    ret !is_empty(s);
}
fn is_whitespace(s: &istr) -> bool {
let i = 0u;
let len = char_len(s);
while i < len {
if !char::is_whitespace(char_at(s, i)) { ret false; }
i += 1u
}
ret true;
}
// Returns the length of `s` in bytes, not counting the null terminator.
fn byte_len(s: &istr) -> uint {
// Reinterpret the string as a byte vector so that vec::len can be used.
let v: [u8] = unsafe::reinterpret_cast(s);
let vlen = vec::len(v);
// NOTE(review): `v` presumably shares its storage with `s`, so it is leaked
// rather than cleaned up here -- confirm against unsafe::leak.
unsafe::leak(v);
// There should always be a null terminator
assert vlen > 0u;
ret vlen - 1u;
}
// Returns the bytes of `s` as a vector, without the final byte of the
// underlying vector (the null terminator, per byte_len).
fn bytes(s: &istr) -> [u8] {
// Reinterpret the string as a byte vector.
let v = unsafe::reinterpret_cast(s);
// Take everything except the last element.
let vcopy = vec::slice(v, 0u, vec::len(v) - 1u);
// NOTE(review): `v` presumably shares its storage with `s`, so it is leaked
// rather than cleaned up here -- confirm against unsafe::leak.
unsafe::leak(v);
ret vcopy;
}
// Builds a string from the bytes in `v` by appending a null terminator and
// reinterpreting the resulting vector as an istr. The bytes are not checked
// for UTF-8 validity, hence "unsafe".
fn unsafe_from_bytes(v: &[mutable? u8]) -> istr {
    let vcopy: [u8] = v + [0u8];
    let scopy: istr = unsafe::reinterpret_cast(vcopy);
    // `scopy` now refers to the same storage as `vcopy`; leak the vector so
    // it is not also cleaned up when this function returns (the same
    // cast-then-leak sequence that slice uses for its result).
    unsafe::leak(vcopy);
    ret scopy;
}
fn unsafe_from_byte(u: u8) -> istr {
unsafe_from_bytes([u])
}
// Encodes `ch` as a sequence of one to six bytes and appends those bytes to
// `s`. The sequence length is chosen from the max_* bounds; the lead byte
// carries the matching tag_* marker and every following byte carries six
// payload bits plus the continuation marker.
fn push_utf8_bytes(s: &mutable istr, ch: char) {
let code = ch as uint;
let bytes = if code < max_one_b {
// One byte: the code point itself.
[code as u8]
} else if code < max_two_b {
// Two bytes: 5 payload bits in the lead byte, 6 in the continuation.
[(code >> 6u & 31u | tag_two_b) as u8,
(code & 63u | tag_cont) as u8]
} else if code < max_three_b {
// Three bytes: 4 payload bits in the lead byte.
[(code >> 12u & 15u | tag_three_b) as u8,
(code >> 6u & 63u | tag_cont) as u8,
(code & 63u | tag_cont) as u8]
} else if code < max_four_b {
// Four bytes: 3 payload bits in the lead byte.
[(code >> 18u & 7u | tag_four_b) as u8,
(code >> 12u & 63u | tag_cont) as u8,
(code >> 6u & 63u | tag_cont) as u8,
(code & 63u | tag_cont) as u8]
} else if code < max_five_b {
// Five bytes: 2 payload bits in the lead byte.
[(code >> 24u & 3u | tag_five_b) as u8,
(code >> 18u & 63u | tag_cont) as u8,
(code >> 12u & 63u | tag_cont) as u8,
(code >> 6u & 63u | tag_cont) as u8,
(code & 63u | tag_cont) as u8]
} else {
// Six bytes: 1 payload bit in the lead byte.
[(code >> 30u & 1u | tag_six_b) as u8,
(code >> 24u & 63u | tag_cont) as u8,
(code >> 18u & 63u | tag_cont) as u8,
(code >> 12u & 63u | tag_cont) as u8,
(code >> 6u & 63u | tag_cont) as u8,
(code & 63u | tag_cont) as u8]
};
push_bytes(s, bytes);
}
fn from_char(ch: char) -> istr {
let buf = ~"";
push_utf8_bytes(buf, ch);
ret buf;
}
fn from_chars(chs: &[char]) -> istr {
let buf = ~"";
for ch: char in chs { push_utf8_bytes(buf, ch); }
ret buf;
}
fn utf8_char_width(b: u8) -> uint {
let byte: uint = b as uint;
if byte < 128u { ret 1u; }
if byte < 192u {
ret 0u; // Not a valid start byte
}
if byte < 224u { ret 2u; }
if byte < 240u { ret 3u; }
if byte < 248u { ret 4u; }
if byte < 252u { ret 5u; }
ret 6u;
}
// Decodes the character whose encoding starts at byte offset `i` of `s`.
// Returns the character together with `next`, the byte offset just past its
// encoding. Fails an assertion if the byte at `i` is not a valid start byte
// or if any following byte of the sequence is not a continuation byte.
fn char_range_at(s: &istr, i: uint) -> {ch: char, next: uint} {
let b0 = s[i];
let w = utf8_char_width(b0);
assert (w != 0u);
// Single-byte sequence: the byte is the character.
if w == 1u { ret {ch: b0 as char, next: i + 1u}; }
let val = 0u;
let end = i + w;
i += 1u;
// Accumulate six payload bits from each continuation byte.
while i < end {
let byte = s[i];
assert (byte & 192u8 == tag_cont_u8);
val <<= 6u;
val += byte & 63u8 as uint;
i += 1u;
}
// Clunky way to get the right bits from the first byte. Uses two shifts,
// the first to clip off the marker bits at the left of the byte, and then
// a second (as uint) to get it to the right position.
val += (b0 << (w + 1u as u8) as uint) << (w - 1u) * 6u - w - 1u;
ret {ch: val as char, next: i};
}
fn char_at(s: &istr, i: uint) -> char { ret char_range_at(s, i).ch; }
fn char_len(s: &istr) -> uint {
let i = 0u;
let len = 0u;
let total = byte_len(s);
while i < total {
let chsize = utf8_char_width(s[i]);
assert (chsize > 0u);
len += 1u;
i += chsize;
}
assert (i == total);
ret len;
}
fn to_chars(s: &istr) -> [char] {
let buf: [char] = [];
let i = 0u;
let len = byte_len(s);
while i < len {
let cur = char_range_at(s, i);
buf += [cur.ch];
i = cur.next;
}
ret buf;
}
fn push_char(s: &mutable istr, ch: char) { s += from_char(ch); }
fn pop_char(s: &mutable istr) -> char {
let end = byte_len(s);
while end > 0u && s[end - 1u] & 192u8 == tag_cont_u8 { end -= 1u; }
assert (end > 0u);
let ch = char_at(s, end - 1u);
s = substr(s, 0u, end - 1u);
ret ch;
}
fn shift_char(s: &mutable istr) -> char {
let r = char_range_at(s, 0u);
s = substr(s, r.next, byte_len(s) - r.next);
ret r.ch;
}
fn unshift_char(s: &mutable istr, ch: char) { s = from_char(ch) + s; }
fn index(s: &istr, c: u8) -> int {
let i: int = 0;
for k: u8 in s { if k == c { ret i; } i += 1; }
ret -1;
}
// Returns the byte offset of the last occurrence of byte `c` in `s`, or -1
// if `c` does not occur (including when `s` is empty).
fn rindex(s: &istr, c: u8) -> int {
    // Start at the last content byte; offset byte_len(s) is past the end of
    // the string's contents.
    let n: int = (byte_len(s) as int) - 1;
    while n >= 0 { if s[n] == c { ret n; } n -= 1; }
    // The loop only exits without a match once n has reached -1.
    ret n;
}
fn find(haystack: &istr, needle: &istr) -> int {
let haystack_len: int = byte_len(haystack) as int;
let needle_len: int = byte_len(needle) as int;
if needle_len == 0 { ret 0; }
fn match_at(haystack: &istr, needle: &istr, i: int) -> bool {
let j: int = i;
for c: u8 in needle { if haystack[j] != c { ret false; } j += 1; }
ret true;
}
let i: int = 0;
while i <= haystack_len - needle_len {
if match_at(haystack, needle, i) { ret i; }
i += 1;
}
ret -1;
}
fn starts_with(haystack: &istr, needle: &istr) -> bool {
let haystack_len: uint = byte_len(haystack);
let needle_len: uint = byte_len(needle);
if needle_len == 0u { ret true; }
if needle_len > haystack_len { ret false; }
ret eq(substr(haystack, 0u, needle_len), needle);
}
fn ends_with(haystack: &istr, needle: &istr) -> bool {
let haystack_len: uint = byte_len(haystack);
let needle_len: uint = byte_len(needle);
ret if needle_len == 0u {
true
} else if needle_len > haystack_len {
false
} else {
eq(substr(haystack, haystack_len - needle_len, needle_len),
needle)
};
}
// Returns the `len` bytes of `s` starting at byte offset `begin`, as a new
// string.
fn substr(s: &istr, begin: uint, len: uint) -> istr {
    let end = begin + len;
    ret slice(s, begin, end);
}
// Returns the bytes of `s` from offset `begin` up to (not including) offset
// `end` as a new string. Fails an assertion unless
// begin <= end <= byte_len(s).
fn slice(s: &istr, begin: uint, end: uint) -> istr {
// FIXME: Typestate precondition
assert (begin <= end);
assert (end <= byte_len(s));
// Reinterpret the string as a byte vector and copy out the requested range.
let v: [u8] = unsafe::reinterpret_cast(s);
let v2 = vec::slice(v, begin, end);
// NOTE(review): `v` presumably shares its storage with `s`, so it is leaked
// rather than cleaned up here -- confirm against unsafe::leak.
unsafe::leak(v);
// Append the null terminator before turning the bytes back into a string.
v2 += [0u8];
let s2: istr = unsafe::reinterpret_cast(v2);
// NOTE(review): likewise `s2` presumably takes over the storage of `v2`.
unsafe::leak(v2);
ret s2;
}
// Same as slice, but with `begin <= end` expressed as a constraint on the
// signature (uint::le(begin, end)) instead of being left to slice's own
// assertion. The upper bound on `end` is still checked at run time.
fn safe_slice(s: &istr, begin: uint, end: uint)
: uint::le(begin, end) -> istr {
// would need some magic to make this a precondition
assert (end <= byte_len(s));
ret slice(s, begin, end);
}
fn shift_byte(s: &mutable istr) -> u8 {
let len = byte_len(s);
assert (len > 0u);
let b = s[0];
s = substr(s, 1u, len - 1u);
ret b;
}
fn pop_byte(s: &mutable istr) -> u8 {
let len = byte_len(s);
assert (len > 0u);
let b = s[len - 1u];
s = substr(s, 0u, len - 1u);
ret b;
}
fn push_byte(s: &mutable istr, b: u8) {
s += unsafe_from_byte(b);
}
fn push_bytes(s: &mutable istr, bytes: &[u8]) {
for byte in bytes {
push_byte(s, byte);
}
}
fn split(s: &istr, sep: u8) -> [istr] {
let v: [istr] = [];
let accum: istr = ~"";
let ends_with_sep: bool = false;
for c: u8 in s {
if c == sep {
v += [accum];
accum = ~"";
ends_with_sep = true;
} else { accum += unsafe_from_byte(c); ends_with_sep = false; }
}
if byte_len(accum) != 0u || ends_with_sep { v += [accum]; }
ret v;
}
fn concat(v: &[istr]) -> istr {
let s: istr = ~"";
for ss: istr in v { s += ss; }
ret s;
}
fn connect(v: &[istr], sep: &istr) -> istr {
let s: istr = ~"";
let first: bool = true;
for ss: istr in v {
if first { first = false; } else { s += sep; }
s += ss;
}
ret s;
}
// FIXME: This only handles ASCII
fn to_upper(s: &istr) -> istr {
let outstr = ~"";
let ascii_a = 'a' as u8;
let ascii_z = 'z' as u8;
let diff = 32u8;
for byte: u8 in s {
let next;
if ascii_a <= byte && byte <= ascii_z {
next = byte - diff;
} else { next = byte; }
push_byte(outstr, next);
}
ret outstr;
}
// Returns a copy of `s` in which every occurrence of `from` is replaced by
// `to`, scanning from the left. `from` must be non-empty, which is expressed
// as a constraint on the signature.
// FIXME: This is super-inefficient
fn replace(s: &istr, from: &istr, to: &istr) : is_not_empty(from) -> istr {
// FIXME (694): Shouldn't have to check this
check (is_not_empty(from));
if byte_len(s) == 0u {
ret ~"";
} else if starts_with(s, from) {
// Match at the front: emit `to` and continue after the matched bytes.
ret to + replace(slice(s, byte_len(from), byte_len(s)), from, to);
} else {
// No match at the front: copy one byte and continue with the rest.
ret unsafe_from_byte(s[0]) +
replace(slice(s, 1u, byte_len(s)), from, to);
}
}
// FIXME: Also not efficient
fn char_slice(s: &istr, begin: uint, end: uint) -> istr {
from_chars(vec::slice(to_chars(s), begin, end))
}
fn trim_left(s: &istr) -> istr {
fn count_whities(s: &[char]) -> uint {
let i = 0u;
while i < vec::len(s) {
if !char::is_whitespace(s[i]) { break; }
i += 1u;
}
ret i;
}
let chars = to_chars(s);
let whities = count_whities(chars);
ret from_chars(vec::slice(chars, whities, vec::len(chars)));
}
fn trim_right(s: &istr) -> istr {
fn count_whities(s: &[char]) -> uint {
let i = vec::len(s);
while 0u < i {
if !char::is_whitespace(s[i - 1u]) { break; }
i -= 1u;
}
ret i;
}
let chars = to_chars(s);
let whities = count_whities(chars);
ret from_chars(vec::slice(chars, 0u, whities));
}
fn trim(s: &istr) -> istr {
trim_left(trim_right(s))
}

View File

@ -16,6 +16,7 @@ mod u8;
mod u64;
mod vec;
mod str;
mod istr;
// General io and system-services modules.

View File

@ -0,0 +1,31 @@
use std;
import std::istr;
import std::vec;
// Exercises the character-level istr functions on strings containing
// multi-byte characters: length queries, char/string round trips, UTF-8
// validation, and pushing/popping characters at both ends.
fn main() {
// Chars of 1, 2, 3, and 4 bytes
let chs: [char] = ['e', 'é', '€', 0x10000 as char];
let s: istr = istr::from_chars(chs);
// 1 + 2 + 3 + 4 bytes, but only four characters.
assert (istr::byte_len(s) == 10u);
assert (istr::char_len(s) == 4u);
assert (vec::len::<char>(istr::to_chars(s)) == 4u);
// Decoding and re-encoding gives back the same string.
assert (istr::eq(istr::from_chars(istr::to_chars(s)), s));
// char_at takes a byte offset; offset 1 is where 'é' starts.
assert (istr::char_at(s, 0u) == 'e');
assert (istr::char_at(s, 1u) == 'é');
assert (istr::is_utf8(istr::bytes(s)));
// A lone continuation byte, a truncated sequence, and a lead byte followed
// by a non-continuation byte are all rejected.
assert (!istr::is_utf8([0x80_u8]));
assert (!istr::is_utf8([0xc0_u8]));
assert (!istr::is_utf8([0xc0_u8, 0x10_u8]));
// Pop and push at the end, then shift and unshift at the front.
let stack = ~"a×c€";
assert (istr::pop_char(stack) == '€');
assert (istr::pop_char(stack) == 'c');
istr::push_char(stack, 'u');
assert (istr::eq(stack, ~"a×u"));
assert (istr::shift_char(stack) == 'a');
assert (istr::shift_char(stack) == '×');
istr::unshift_char(stack, 'ß');
assert (istr::eq(stack, ~"ßu"));
}

251
src/test/stdtest/istr.rs Normal file
View File

@ -0,0 +1,251 @@
import std::istr;
// istr::eq on two empty strings, two equal strings and two different ones.
#[test]
fn test_eq() {
assert istr::eq(~"", ~"");
assert istr::eq(~"foo", ~"foo");
assert !istr::eq(~"foo", ~"bar");
}
// istr::lteq on equal strings, on a smaller-then-larger pair, and on a
// larger-then-smaller pair (which must be rejected).
#[test]
fn test_lteq() {
    assert istr::lteq(~"", ~"");
    assert istr::lteq(~"", ~"foo");
    assert istr::lteq(~"foo", ~"foo");
    // The negative case has to go through lteq as well, not eq.
    assert !istr::lteq(~"foo", ~"bar");
}
// istr::byte_len counts encoded bytes, so characters written with escapes
// report the width of their encoding (1 to 4 bytes here).
#[test]
fn test_bytes_len() {
assert (istr::byte_len(~"") == 0u);
assert (istr::byte_len(~"hello world") == 11u);
assert (istr::byte_len(~"\x63") == 1u);
assert (istr::byte_len(~"\xa2") == 2u);
assert (istr::byte_len(~"\u03c0") == 2u);
assert (istr::byte_len(~"\u2620") == 3u);
assert (istr::byte_len(~"\U0001d11e") == 4u);
}
// istr::index reports the first matching byte offset and istr::rindex the
// last one; both report -1 for a byte that does not occur.
#[test]
fn test_index_and_rindex() {
assert (istr::index(~"hello", 'e' as u8) == 1);
assert (istr::index(~"hello", 'o' as u8) == 4);
assert (istr::index(~"hello", 'z' as u8) == -1);
assert (istr::rindex(~"hello", 'l' as u8) == 3);
assert (istr::rindex(~"hello", 'h' as u8) == 0);
assert (istr::rindex(~"hello", 'z' as u8) == -1);
}
// istr::split, including leading, repeated and trailing separators.
#[test]
fn test_split() {
// Splits `s` on `c` and checks that piece number `i` equals `k`.
fn t(s: &istr, c: char, i: int, k: &istr) {
log ~"splitting: " + s;
log i;
let v = istr::split(s, c as u8);
log ~"split to: ";
for z: istr in v { log z; }
log ~"comparing: " + v[i] + ~" vs. " + k;
assert (istr::eq(v[i], k));
}
t(~"abc.hello.there", '.', 0, ~"abc");
t(~"abc.hello.there", '.', 1, ~"hello");
t(~"abc.hello.there", '.', 2, ~"there");
t(~".hello.there", '.', 0, ~"");
t(~".hello.there", '.', 1, ~"hello");
t(~"...hello.there.", '.', 3, ~"hello");
t(~"...hello.there.", '.', 5, ~"");
}
// istr::find for present, absent, empty and too-long needles.
#[test]
fn test_find() {
// Checks that searching `haystack` for `needle` yields offset `i`.
fn t(haystack: &istr, needle: &istr, i: int) {
let j: int = istr::find(haystack, needle);
log ~"searched for " + needle;
log j;
assert (i == j);
}
t(~"this is a simple", ~"is a", 5);
t(~"this is a simple", ~"is z", -1);
t(~"this is a simple", ~"", 0);
t(~"this is a simple", ~"simple", 10);
t(~"this", ~"simple", -1);
}
// istr::substr extracts the expected bytes from several offsets.
#[test]
fn test_substr() {
// Checks that the byte_len(b) bytes of `a` starting at `start` equal `b`.
fn t(a: &istr, b: &istr, start: int) {
assert (istr::eq(istr::substr(a, start as uint,
istr::byte_len(b)), b));
}
t(~"hello", ~"llo", 2);
t(~"hello", ~"el", 1);
t(~"substr should not be a challenge", ~"not", 14);
}
// istr::concat on several strings, on an empty vector and on one string.
#[test]
fn test_concat() {
// Checks that concatenating `v` gives `s`.
fn t(v: &[istr], s: &istr) { assert (istr::eq(istr::concat(v), s)); }
t([~"you", ~"know", ~"I'm", ~"no", ~"good"], ~"youknowI'mnogood");
let v: [istr] = [];
t(v, ~"");
t([~"hi"], ~"hi");
}
// istr::connect on several strings, on an empty vector and on one string.
#[test]
fn test_connect() {
// Checks that joining `v` with `sep` gives `s`.
fn t(v: &[istr], sep: &istr, s: &istr) {
assert (istr::eq(istr::connect(v, sep), s));
}
t([~"you", ~"know", ~"I'm", ~"no", ~"good"], ~" ",
~"you know I'm no good");
let v: [istr] = [];
t(v, ~" ", ~"");
t([~"hi"], ~" ", ~"hi");
}
// istr::to_upper upper-cases ASCII letters and leaves everything else,
// including multi-byte characters, untouched.
#[test]
fn test_to_upper() {
// to_upper doesn't understand unicode yet,
// but we need to at least preserve it
let unicode = ~"\u65e5\u672c";
let input = ~"abcDEF" + unicode + ~"xyz:.;";
let expected = ~"ABCDEF" + unicode + ~"XYZ:.;";
let actual = istr::to_upper(input);
assert (istr::eq(expected, actual));
}
// istr::slice on small strings (including an empty range) and on a string
// of one million bytes.
#[test]
fn test_slice() {
assert (istr::eq(~"ab", istr::slice(~"abc", 0u, 2u)));
assert (istr::eq(~"bc", istr::slice(~"abc", 1u, 3u)));
assert (istr::eq(~"", istr::slice(~"abc", 1u, 1u)));
// Builds 100000 * 10 = 1,000,000 'a's.
fn a_million_letter_a() -> istr {
let i = 0;
let rs = ~"";
while i < 100000 { rs += ~"aaaaaaaaaa"; i += 1; }
ret rs;
}
// Builds 100000 * 5 = 500,000 'a's.
fn half_a_million_letter_a() -> istr {
let i = 0;
let rs = ~"";
while i < 100000 { rs += ~"aaaaa"; i += 1; }
ret rs;
}
assert (istr::eq(half_a_million_letter_a(),
istr::slice(a_million_letter_a(), 0u, 500000u)));
}
// istr::starts_with with empty needles, a matching prefix, and needles
// longer than the haystack.
#[test]
fn test_starts_with() {
assert (istr::starts_with(~"", ~""));
assert (istr::starts_with(~"abc", ~""));
assert (istr::starts_with(~"abc", ~"a"));
assert (!istr::starts_with(~"a", ~"abc"));
assert (!istr::starts_with(~"", ~"abc"));
}
// istr::ends_with with empty needles, a matching suffix, and needles longer
// than the haystack.
#[test]
fn test_ends_with() {
assert (istr::ends_with(~"", ~""));
assert (istr::ends_with(~"abc", ~""));
assert (istr::ends_with(~"abc", ~"c"));
assert (!istr::ends_with(~"a", ~"abc"));
assert (!istr::ends_with(~"", ~"abc"));
}
// istr::is_empty holds for "" and not for a one-character string.
#[test]
fn test_is_empty() {
assert (istr::is_empty(~""));
assert (!istr::is_empty(~"a"));
}
// istr::is_not_empty holds for a one-character string and not for "".
#[test]
fn test_is_not_empty() {
assert (istr::is_not_empty(~"a"));
assert (!istr::is_not_empty(~""));
}
// istr::replace on an empty input, on single and repeated matches, and with
// an empty replacement. The `check` statements establish the is_not_empty
// constraint that replace places on its `from` argument.
#[test]
fn test_replace() {
    let a = ~"a";
    check (istr::is_not_empty(a));
    assert (istr::replace(~"", a, ~"b") == ~"");
    assert (istr::replace(~"a", a, ~"b") == ~"b");
    assert (istr::replace(~"ab", a, ~"b") == ~"bb");
    let test = ~"test";
    check (istr::is_not_empty(test));
    assert (istr::replace(~" test test ", test, ~"toast")
            == ~" toast toast ");
    // Deleting both occurrences leaves the three spaces that surrounded
    // them.
    assert (istr::replace(~" test test ", test, ~"") == ~"   ");
}
// istr::char_slice indexes by character, so slicing one character out of a
// string of multi-byte characters yields that whole character.
#[test]
fn test_char_slice() {
assert (istr::eq(~"ab", istr::char_slice(~"abc", 0u, 2u)));
assert (istr::eq(~"bc", istr::char_slice(~"abc", 1u, 3u)));
assert (istr::eq(~"", istr::char_slice(~"abc", 1u, 1u)));
assert (istr::eq(~"\u65e5", istr::char_slice(~"\u65e5\u672c", 0u, 1u)));
}
// istr::trim_left removes leading whitespace (including U+3000) and keeps
// trailing whitespace.
#[test]
fn trim_left() {
assert (istr::trim_left(~"") == ~"");
assert (istr::trim_left(~"a") == ~"a");
assert (istr::trim_left(~" ") == ~"");
assert (istr::trim_left(~" blah") == ~"blah");
assert (istr::trim_left(~" \u3000 wut") == ~"wut");
assert (istr::trim_left(~"hey ") == ~"hey ");
}
// istr::trim_right removes trailing whitespace (including U+3000) and keeps
// leading whitespace.
#[test]
fn trim_right() {
assert (istr::trim_right(~"") == ~"");
assert (istr::trim_right(~"a") == ~"a");
assert (istr::trim_right(~" ") == ~"");
assert (istr::trim_right(~"blah ") == ~"blah");
assert (istr::trim_right(~"wut \u3000 ") == ~"wut");
assert (istr::trim_right(~" hey") == ~" hey");
}
// istr::trim removes whitespace from both ends and keeps interior
// whitespace.
#[test]
fn trim() {
assert (istr::trim(~"") == ~"");
assert (istr::trim(~"a") == ~"a");
assert (istr::trim(~" ") == ~"");
assert (istr::trim(~" blah ") == ~"blah");
assert (istr::trim(~"\nwut \u3000 ") == ~"wut");
assert (istr::trim(~" hey dude ") == ~"hey dude");
}
// istr::is_whitespace accepts the empty string and all-whitespace strings
// and rejects a string containing a non-whitespace character.
#[test]
fn is_whitespace() {
assert (istr::is_whitespace(~""));
assert (istr::is_whitespace(~" "));
assert (istr::is_whitespace(~"\u2009")); // Thin space
assert (istr::is_whitespace(~" \n\t "));
assert (!istr::is_whitespace(~" _ "));
}
// istr::is_ascii accepts the empty string and an ASCII letter and rejects a
// multi-byte character.
#[test]
fn is_ascii() {
assert istr::is_ascii(~"");
assert istr::is_ascii(~"a");
assert !istr::is_ascii(~"\u2009");
}
// istr::shift_byte removes and returns the first byte (65 is 'A').
#[test]
fn shift_byte() {
let s = ~"ABC";
let b = istr::shift_byte(s);
assert s == ~"BC";
assert b == 65u8;
}
// istr::pop_byte removes and returns the last byte (67 is 'C').
#[test]
fn pop_byte() {
let s = ~"ABC";
let b = istr::pop_byte(s);
assert s == ~"AB";
assert b == 67u8;
}

View File

@ -25,6 +25,7 @@ mod sha1;
mod sort;
mod str_buf;
mod str;
mod istr;
mod task;
mod test;
mod uint;