std: Stabilize the Utf8Error type

The meaning of each variant of this enum was somewhat ambiguous, and it's unclear whether we might want to add more enumeration values in the future. As a result, this error has been changed into an opaque structure. Learning about the "first invalid byte index" is still an unstable feature, but the type itself is now stable.
2015-04-10 16:05:09 -07:00 · 2015-04-10 16:05:09 -07:00 · f329030b09
commit f329030b09
parent c897ac04e2
6 changed files with 26 additions and 38 deletions
--- a/src/libcollections/lib.rs
+++ b/src/libcollections/lib.rs
@ -40,6 +40,7 @@
 #![feature(str_char)]
 #![feature(slice_patterns)]
 #![feature(debug_builders)]
+#![feature(utf8_error)]
 #![cfg_attr(test, feature(rand, rustc_private, test, hash, collections))]
 #![cfg_attr(test, allow(deprecated))] // rand

--- a/src/libcollections/string.rs
+++ b/src/libcollections/string.rs
@ -132,7 +132,7 @@ impl String {
    ///
    /// let invalid_vec = vec![240, 144, 128];
    /// let s = String::from_utf8(invalid_vec).err().unwrap();
-    /// assert_eq!(s.utf8_error(), Utf8Error::TooShort);
+    /// let err = s.utf8_error();
    /// assert_eq!(s.into_bytes(), [240, 144, 128]);
    /// ```
    #[inline]
@ -156,14 +156,10 @@ impl String {
    /// ```
    #[stable(feature = "rust1", since = "1.0.0")]
    pub fn from_utf8_lossy<'a>(v: &'a [u8]) -> Cow<'a, str> {
-        let mut i = 0;
+        let mut i;
        match str::from_utf8(v) {
            Ok(s) => return Cow::Borrowed(s),
-            Err(e) => {
-                if let Utf8Error::InvalidByte(firstbad) = e {
-                    i = firstbad;
-                }
-            }
+            Err(e) => i = e.valid_up_to(),
        }

        const TAG_CONT_U8: u8 = 128;
@ -188,9 +184,9 @@ impl String {
            };
        }

-        // subseqidx is the index of the first byte of the subsequence we're looking at.
-        // It's used to copy a bunch of contiguous good codepoints at once instead of copying
-        // them one by one.
+        // subseqidx is the index of the first byte of the subsequence we're
+        // looking at.  It's used to copy a bunch of contiguous good codepoints
+        // at once instead of copying them one by one.
        let mut subseqidx = i;

        while i < total {
--- a/src/libcollectionstest/str.rs
+++ b/src/libcollectionstest/str.rs
@ -1502,7 +1502,7 @@ fn test_str_from_utf8() {
    assert_eq!(from_utf8(xs), Ok("ศไทย中华Việt Nam"));

    let xs = b"hello\xFF";
-    assert_eq!(from_utf8(xs), Err(Utf8Error::TooShort));
+    assert!(from_utf8(xs).is_err());
 }

 #[test]
--- a/src/libcollectionstest/string.rs
+++ b/src/libcollectionstest/string.rs
@ -45,7 +45,6 @@ fn test_from_utf8() {

    let xs = b"hello\xFF".to_vec();
    let err = String::from_utf8(xs).err().unwrap();
-    assert_eq!(err.utf8_error(), Utf8Error::TooShort);
    assert_eq!(err.into_bytes(), b"hello\xff".to_vec());
 }

--- a/src/libcore/str/mod.rs
+++ b/src/libcore/str/mod.rs
@ -106,19 +106,19 @@ Section: Creating a string

 /// Errors which can occur when attempting to interpret a byte slice as a `str`.
 #[derive(Copy, Eq, PartialEq, Clone, Debug)]
-#[unstable(feature = "core",
-           reason = "error enumeration recently added and definitions may be refined")]
-pub enum Utf8Error {
-    /// An invalid byte was detected at the byte offset given.
-    ///
-    /// The offset is guaranteed to be in bounds of the slice in question, and
-    /// the byte at the specified offset was the first invalid byte in the
-    /// sequence detected.
-    InvalidByte(usize),
+#[stable(feature = "rust1", since = "1.0.0")]
+pub struct Utf8Error {
+    valid_up_to: usize,
+}

-    /// The byte slice was invalid because more bytes were needed but no more
-    /// bytes were available.
-    TooShort,
+impl Utf8Error {
+    /// Returns the index in the given string up to which valid UTF-8 was
+    /// verified.
+    ///
+    /// Starting at the index provided, but not necessarily at it precisely, an
+    /// invalid UTF-8 encoding sequence was found.
+    #[unstable(feature = "utf8_error", reason = "method just added")]
+    pub fn valid_up_to(&self) -> usize { self.valid_up_to }
 }

 /// Converts a slice of bytes to a string slice without performing any
@ -147,14 +147,7 @@ pub unsafe fn from_utf8_unchecked<'a>(v: &'a [u8]) -> &'a str {
 #[stable(feature = "rust1", since = "1.0.0")]
 impl fmt::Display for Utf8Error {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        match *self {
-            Utf8Error::InvalidByte(n) => {
-                write!(f, "invalid utf-8: invalid byte at index {}", n)
-            }
-            Utf8Error::TooShort => {
-                write!(f, "invalid utf-8: byte slice too short")
-            }
-        }
+        write!(f, "invalid utf-8: invalid byte near index {}", self.valid_up_to)
    }
 }

@ -1218,14 +1211,16 @@ fn run_utf8_validation_iterator(iter: &mut slice::Iter<u8>)
        // restore the iterator we had at the start of this codepoint.
        macro_rules! err { () => {{
            *iter = old.clone();
-            return Err(Utf8Error::InvalidByte(whole.len() - iter.as_slice().len()))
+            return Err(Utf8Error {
+                valid_up_to: whole.len() - iter.as_slice().len()
+            })
        }}}

        macro_rules! next { () => {
            match iter.next() {
                Some(a) => *a,
                // we needed data, but there was none: error!
-                None => return Err(Utf8Error::TooShort),
+                None => err!(),
            }
        }}

--- a/src/libstd/error.rs
+++ b/src/libstd/error.rs
@ -122,10 +122,7 @@ impl Error for str::ParseBoolError {
 #[stable(feature = "rust1", since = "1.0.0")]
 impl Error for str::Utf8Error {
    fn description(&self) -> &str {
-        match *self {
-            str::Utf8Error::TooShort => "invalid utf-8: not enough bytes",
-            str::Utf8Error::InvalidByte(..) => "invalid utf-8: corrupt contents",
-        }
+        "invalid utf-8: corrupt contents"
    }
 }