Merge pull request #1812 from killerswan/indexing2

(core::str) Fixing index and rindex
2012-02-11 17:42:45 -08:00 · 2012-02-11 17:42:45 -08:00 · 737db5b49a
commit 737db5b49a
parent c82a0d7c3c 207bb3d2df
7 changed files with 150 additions and 86 deletions
--- a/src/cargo/cargo.rs
+++ b/src/cargo/cargo.rs
@ -651,25 +651,27 @@ fn cmd_install(c: cargo) unsafe {

    if str::starts_with(target, "uuid:") {
        let uuid = rest(target, 5u);
-        let idx = str::index(uuid, '/' as u8);
-        if idx != -1 {
-            let source = str::unsafe::slice_bytes(uuid, 0u, idx as uint);
-            uuid = str::unsafe::slice_bytes(uuid, idx as uint + 1u,
-                                      str::byte_len(uuid));
-            install_uuid_specific(c, wd, source, uuid);
-        } else {
-            install_uuid(c, wd, uuid);
+        alt str::index(uuid, '/') {
+            option::some(idx) {
+               let source = str::slice(uuid, 0u, idx);
+               uuid = str::slice(uuid, idx + 1u, str::char_len(uuid));
+               install_uuid_specific(c, wd, source, uuid);
+            }
+            option::none {
+               install_uuid(c, wd, uuid);
+            }
        }
    } else {
        let name = target;
-        let idx = str::index(name, '/' as u8);
-        if idx != -1 {
-            let source = str::unsafe::slice_bytes(name, 0u, idx as uint);
-            name = str::unsafe::slice_bytes(name, idx as uint + 1u,
-                                      str::byte_len(name));
-            install_named_specific(c, wd, source, name);
-        } else {
-            install_named(c, wd, name);
+        alt str::index(name, '/') {
+            option::some(idx) {
+               let source = str::slice(name, 0u, idx);
+               name = str::slice(name, idx + 1u, str::char_len(name));
+               install_named_specific(c, wd, source, name);
+            }
+            option::none {
+               install_named(c, wd, name);
+            }
        }
    }
 }
--- a/src/comp/back/link.rs
+++ b/src/comp/back/link.rs
@ -109,14 +109,16 @@ mod write {
    // Decides what to call an intermediate file, given the name of the output
    // and the extension to use: the part of output_path before its first '.'
    // (or all of output_path when it has no '.'), then "." and the extension.
    // NOTE(review): the first '.' is searched for in the whole path, so a
    // dotted directory component (e.g. "./out/a.o") cuts the stem short.
    fn mk_intermediate_name(output_path: str, extension: str) -> str unsafe {
-        let dot_pos = str::index(output_path, '.' as u8);
-        let stem;
-        if dot_pos < 0 {
-            stem = output_path;
-        } else { stem = str::unsafe::slice_bytes(output_path, 0u,
-                                                 dot_pos as uint); }
+        let stem = alt str::index(output_path, '.') {
+                       option::some(dot_pos) {
+                           str::slice(output_path, 0u, dot_pos)
+                       }
+                       option::none { output_path }  // no '.': keep it all
+                   };
+
        ret stem + "." + extension;
    }
+
    fn run_passes(sess: session, llmod: ModuleRef, output: str) {
        let opts = sess.opts;
        if opts.time_llvm_passes { llvm::LLVMRustEnableTimePasses(); }
--- a/src/comp/syntax/codemap.rs
+++ b/src/comp/syntax/codemap.rs
@ -119,16 +119,13 @@ fn get_line(fm: filemap, line: int) -> str unsafe {
    let end: uint;
    if line as uint < vec::len(fm.lines) - 1u {
        end = fm.lines[line + 1].byte - fm.start_pos.byte;
+        ret str::unsafe::slice_bytes(*fm.src, begin, end);
    } else {
        // If we're not done parsing the file, we're at the limit of what's
        // parsed. If we just slice the rest of the string, we'll print out
        // the remainder of the file, which is undesirable.
-        end = str::byte_len(*fm.src);
-        let rest = str::unsafe::slice_bytes(*fm.src, begin, end);
-        let newline = str::index(rest, '\n' as u8);
-        if newline != -1 { end = begin + (newline as uint); }
+        ret str::splitn_char(*fm.src, '\n', 1u)[0];
    }
-    ret str::unsafe::slice_bytes(*fm.src, begin, end);
 }

 fn lookup_byte_offset(cm: codemap::codemap, chpos: uint)
--- a/src/fuzzer/fuzzer.rs
+++ b/src/fuzzer/fuzzer.rs
@ -283,10 +283,9 @@ fn check_variants_T<T: copy>(
    }
 }

-fn last_part(filename: str) -> str unsafe {
-  let ix = str::rindex(filename, 47u8 /* '/' */);
-  assert ix >= 0;
-  str::unsafe::slice_bytes(filename, ix as uint + 1u, str::byte_len(filename) - 3u)
+fn last_part(filename: str) -> str {  // text after the last '/', minus 3 chars
+  // presumably option::get fails when there is no '/' -- TODO confirm
+  let ix = option::get(str::rindex(filename, '/'));
+  // the final 3 chars are dropped (looks like a ".rs" suffix -- confirm)
+  str::slice(filename, ix + 1u, str::char_len(filename) - 3u)
 }

 enum happiness { passed, cleanly_rejected(str), known_bug(str), failed(str), }
--- a/src/libcore/str.rs
+++ b/src/libcore/str.rs
@ -253,15 +253,12 @@ Function: pop_char
 Remove the final character from a string and return it.

 Failure:
-
 If the string does not contain any characters.
 */
 fn pop_char(&s: str) -> char unsafe {
    let end = byte_len(s);  // byte position just past the final char
-    while end > 0u && s[end - 1u] & 192u8 == tag_cont_u8 { end -= 1u; }
-    assert (end > 0u);
-    let ch = char_at(s, end - 1u);
-    s = unsafe::slice_bytes(s, 0u, end - 1u);
+    // NOTE(review): the explicit `assert (end > 0u)` is gone; failing on an
+    // empty string is now left to char_range_at_reverse.
+    let {ch:ch, prev:end} = char_range_at_reverse(s, end);  // end: ch's 1st byte
+    s = unsafe::slice_bytes(s, 0u, end);  // keep only the bytes before ch
    ret ch;
 }

@ -868,32 +865,50 @@ fn lines_iter(ss: str, ff: fn(&&str)) {
 Section: Searching
 */

-/*
-Function: index
+// Function: index
+//
+// Returns the char index (not the byte offset) of the first char equal to cc
+// (option::some(index) when found, option::none otherwise)
+fn index(ss: str, cc: char) -> option<uint> {
+    let bii = 0u;  // byte offset of the char being examined
+    let cii = 0u;  // char index of the char being examined
+    let len = byte_len(ss);
+    while bii < len {
+        let {ch, next} = char_range_at(ss, bii);

-Returns the index of the first matching byte. Returns -1 if
-no match is found.
+        // found here?
+        if ch == cc {
+            ret option::some(cii);
+        }

-FIXME: UTF-8
-*/
-fn index(s: str, c: u8) -> int {
-    let i: int = 0;
-    for k: u8 in s { if k == c { ret i; } i += 1; }
-    ret -1;
+        cii += 1u;
+        bii = next;  // byte offset where the following char starts
+    }
+
+    // wasn't found
+    ret option::none;
 }

-/*
-Function: rindex
+// Function: rindex
+//
+// Returns the char index (not the byte offset) of the last char equal to cc
+// (option::some(index) when found, option::none otherwise)
+fn rindex(ss: str, cc: char) -> option<uint> {
+    let bii = byte_len(ss);  // byte position just past the next char to examine
+    let cii = char_len(ss);  // one more than that char's index
+    while bii > 0u {
+        let {ch, prev} = char_range_at_reverse(ss, bii);
+        cii -= 1u;  // cii is now the char index of ch
+        bii = prev;

-Returns the index of the last matching byte. Returns -1
-if no match is found.
+        // found here?
+        if ch == cc {
+            ret option::some(cii);
+        }
+    }

-FIXME: UTF-8
-*/
-fn rindex(s: str, c: u8) -> int {
-    let n: int = byte_len(s) as int;
-    while n >= 0 { if s[n] == c { ret n; } n -= 1; }
-    ret n;
+    // wasn't found
+    ret option::none;
 }

 /*
@ -1233,6 +1248,25 @@ Pluck a character out of a string
 */
 fn char_at(s: str, i: uint) -> char { ret char_range_at(s, i).ch; }

+// Function: char_range_at_reverse
+//
+// Given a str and a byte position, return the char that ends immediately
+// before that position together with the byte position of its first byte.
+// Feeding the returned `prev` back in as the next `start` iterates over a
+// unicode string in reverse.
+//
+// Failure:
+// If there is no char before `start` (for example `start` is 0, or the
+// string is empty).
+fn char_range_at_reverse(ss: str, start: uint) -> {ch: char, prev: uint} {
+    let prev = start;
+
+    // while there is a previous byte == 10......
+    while prev > 0u && ss[prev - 1u] & 192u8 == tag_cont_u8 {
+        prev -= 1u;
+    }
+
+    // there must still be an initial byte to step back onto; without this
+    // check the subtraction below would underflow when prev is 0
+    assert (prev > 0u);
+
+    // now refer to the initial byte of previous char
+    prev -= 1u;
+
+    let ch = char_at(ss, prev);
+    ret {ch:ch, prev:prev};
+}
+
 /*
 Function: substr_all

@ -1442,13 +1476,42 @@ mod tests {
    }

    #[test]
-    fn test_index_and_rindex() {
-        assert (index("hello", 'e' as u8) == 1);
-        assert (index("hello", 'o' as u8) == 4);
-        assert (index("hello", 'z' as u8) == -1);
-        assert (rindex("hello", 'l' as u8) == 3);
-        assert (rindex("hello", 'h' as u8) == 0);
-        assert (rindex("hello", 'z' as u8) == -1);
+    fn test_index() {
+        assert ( index("hello", 'h') == option::some(0u));
+        assert ( index("hello", 'e') == option::some(1u));
+        assert ( index("hello", 'o') == option::some(4u));
+        assert ( index("hello", 'z') == option::none);
+
+        // multi-byte chars: the result is a char index, not a byte offset,
+        // and it is the first occurrence that is reported
+        assert ( index("ประเทศไทย中华", 'ท') == option::some(4u));
+        assert ( index("ประเทศไทย中华", '中') == option::some(9u));
+
+        // nothing can be found in an empty string
+        assert ( index("", 'h') == option::none);
+    }
+
+    #[test]
+    fn test_rindex() {
+        assert (rindex("hello", 'l') == option::some(3u));
+        assert (rindex("hello", 'o') == option::some(4u));
+        assert (rindex("hello", 'h') == option::some(0u));
+        assert (rindex("hello", 'z') == option::none);
+
+        // multi-byte chars: the result is a char index, not a byte offset,
+        // and it is the last occurrence that is reported
+        assert (rindex("ประเทศไทย中华", 'ท') == option::some(7u));
+        assert (rindex("ประเทศไทย中华", '华') == option::some(10u));
+
+        // nothing can be found in an empty string
+        assert (rindex("", 'h') == option::none);
+    }
+
+    #[test]
+    fn test_pop_char() {  // popping a multi-byte char off a longer string
+        let data = "ประเทศไทย中华";
+        let cc = pop_char(data);  // also shortens data in place
+        assert "ประเทศไทย中" == data;
+        assert '华' == cc;
+    }
+
+    #[test]
+    fn test_pop_char_2() {  // popping the only char leaves an empty string
+        let data2 = "华";
+        let cc2 = pop_char(data2);
+        assert "" == data2;
+        assert '华' == cc2;
+    }
+
+    #[test]
+    #[should_fail]
+    fn test_pop_char_fail() {  // an empty string has no char to pop
+        let data = "";
+        let _cc3 = pop_char(data);  // expected to fail here
     }

    #[test]
--- a/src/libstd/fs.rs
+++ b/src/libstd/fs.rs
@ -32,6 +32,22 @@ A path or fragment of a filesystem path
 */
 type path = str;

+// Function: splitDirnameBasename
+//
+// Splits a path at its last separator (the rightmost occurrence of either
+// os_fs::path_sep or os_fs::alt_path_sep) and returns the text before it as
+// `dirname` and the text after it as `basename`. The separator itself is in
+// neither part.
+//
+// If the path contains no separator, `dirname` is "." and `basename` is the
+// whole path.
+fn splitDirnameBasename (pp: path) -> {dirname: str, basename: str} {
+    let ii;
+    alt str::rindex(pp, os_fs::path_sep) {
+        option::some(xx) {
+            // both kinds of separator may be present in one path: split at
+            // whichever occurs further to the right
+            alt str::rindex(pp, os_fs::alt_path_sep) {
+                option::some(yy) { ii = if yy > xx { yy } else { xx }; }
+                option::none { ii = xx; }
+            }
+        }
+        option::none {
+            alt str::rindex(pp, os_fs::alt_path_sep) {
+                option::some(xx) { ii = xx; }
+                option::none { ret {dirname: ".", basename: pp}; }
+            }
+        }
+    }
+
+    // ii is a char index, which is what str::slice and char_len work in
+    ret {dirname: str::slice(pp, 0u, ii),
+         basename: str::slice(pp, ii + 1u, str::char_len(pp))};
+}
+
 /*
 Function: dirname

@ -43,13 +59,8 @@ The dirname of "/usr/share" will be "/usr", but the dirname of

 If the path is not prefixed with a directory, then "." is returned.
 */
-fn dirname(p: path) -> path unsafe {
-    let i: int = str::rindex(p, os_fs::path_sep as u8);
-    if i == -1 {
-        i = str::rindex(p, os_fs::alt_path_sep as u8);
-        if i == -1 { ret "."; }
-    }
-    ret str::unsafe::slice_bytes(p, 0u, i as uint);
+fn dirname(pp: path) -> path {
+    // the helper yields "." as dirname when pp contains no separator
+    ret splitDirnameBasename(pp).dirname;
 }

 /*
@ -63,18 +74,10 @@ path separators in the path then the returned path is identical to
 the provided path. If an empty path is provided or the path ends
 with a path separator then an empty path is returned.
 */
-fn basename(p: path) -> path unsafe {
-    let i: int = str::rindex(p, os_fs::path_sep as u8);
-    if i == -1 {
-        i = str::rindex(p, os_fs::alt_path_sep as u8);
-        if i == -1 { ret p; }
-    }
-    let len = str::byte_len(p);
-    if (i + 1) as uint >= len { ret p; }
-    ret str::unsafe::slice_bytes(p, (i + 1) as uint, len);
+fn basename(pp: path) -> path {
+    // the helper yields all of pp as basename when pp contains no separator
+    ret splitDirnameBasename(pp).basename;
 }

-
 // FIXME: Need some typestate to avoid bounds check when len(pre) == 0
 /*
 Function: connect
--- a/src/libstd/getopts.rs
+++ b/src/libstd/getopts.rs
@ -230,16 +230,14 @@ fn getopts(args: [str], opts: [opt]) -> result unsafe {
            let i_arg = option::none::<str>;
            if cur[1] == '-' as u8 {
                let tail = str::unsafe::slice_bytes(cur, 2u, curlen);
-                let eq = str::index(tail, '=' as u8);
-                if eq == -1 {
+                let tail_eq = str::splitn_char(tail, '=', 1u);
+                if vec::len(tail_eq) <= 1u {
                    names = [long(tail)];
                } else {
                    names =
-                        [long(str::unsafe::slice_bytes(tail,0u,eq as uint))];
+                        [long(tail_eq[0])];
                    i_arg =
-                        option::some::<str>(str::unsafe::slice_bytes(tail,
-                                                       (eq as uint) + 1u,
-                                                       curlen - 2u));
+                        option::some::<str>(tail_eq[1]);
                }
            } else {
                let j = 1u;