From 6066118b2988fee1b758d542cee18ffe7a84acab Mon Sep 17 00:00:00 2001
From: blake2-ppc <blake2-ppc>
Date: Thu, 15 Aug 2013 05:23:33 +0200
Subject: [PATCH] std::to_bytes: Delimit sequences &[A] and ~str when hashing

Address issue #5257. For example, these values all had the same hash value:

	("aaa", "bbb", "ccc")
	("aaab", "bb", "ccc")
	("aaabbb", "", "ccc")

IterBytes for &[A] now includes the length, before calling iter_bytes on
each element.

IterBytes for &str is now terminated by 0xFF, a byte that never appears in
UTF-8. This way, hashing a string processes only one extra byte.
---
 src/libstd/hash.rs      | 10 +++++-
 src/libstd/str/ascii.rs |  2 --
 src/libstd/to_bytes.rs  | 77 ++++++++++++++++++++---------------------
 3 files changed, 47 insertions(+), 42 deletions(-)
diff --git a/src/libstd/hash.rs b/src/libstd/hash.rs
index c9d031ed1b1..1af885100cd 100644
--- a/src/libstd/hash.rs
+++ b/src/libstd/hash.rs
@@ -409,6 +409,14 @@ mod tests {
 
     use uint;
 
+    // Hash just the bytes of the slice, without length prefix
+    struct Bytes<'self>(&'self [u8]);
+    impl<'self> IterBytes for Bytes<'self> {
+        fn iter_bytes(&self, _lsb0: bool, f: &fn(&[u8]) -> bool) -> bool {
+            f(**self)
+        }
+    }
+
     #[test]
     fn test_siphash() {
         let vecs : [[u8, ..8], ..64] = [
@@ -496,7 +504,7 @@ mod tests {
         while t < 64 {
             debug!("siphash test %?", t);
             let vec = u8to64_le!(vecs[t], 0);
-            let out = buf.hash_keyed(k0, k1);
+            let out = Bytes(buf.as_slice()).hash_keyed(k0, k1);
             debug!("got %?, expected %?", out, vec);
             assert_eq!(vec, out);
 
diff --git a/src/libstd/str/ascii.rs b/src/libstd/str/ascii.rs
index e0068f5e53e..1cfbf841537 100644
--- a/src/libstd/str/ascii.rs
+++ b/src/libstd/str/ascii.rs
@@ -376,7 +376,6 @@ static ASCII_UPPER_MAP: &'static [u8] = &[
 #[cfg(test)]
 mod tests {
     use super::*;
-    use to_bytes::ToBytes;
     use str::from_char;
 
     macro_rules! v2ascii (
@@ -445,7 +444,6 @@ mod tests {
 
     #[test]
     fn test_ascii_to_bytes() {
-        assert_eq!(v2ascii!(~[40, 32, 59]).to_bytes(false), ~[40u8, 32u8, 59u8]);
         assert_eq!(v2ascii!(~[40, 32, 59]).into_bytes(), ~[40u8, 32u8, 59u8]);
     }
 
diff --git a/src/libstd/to_bytes.rs b/src/libstd/to_bytes.rs
index 4d84b6d251d..a831c97438a 100644
--- a/src/libstd/to_bytes.rs
+++ b/src/libstd/to_bytes.rs
@@ -15,37 +15,43 @@ The `ToBytes` and `IterBytes` traits
 */
 
 use cast;
+use container::Container;
 use io;
 use io::Writer;
 use iterator::Iterator;
 use option::{None, Option, Some};
-use str::StrSlice;
-use vec::ImmutableVector;
+use str::{Str, StrSlice};
+use vec::{Vector, ImmutableVector};
 
 pub type Cb<'self> = &'self fn(buf: &[u8]) -> bool;
 
-/**
- * A trait to implement in order to make a type hashable;
- * This works in combination with the trait `Hash::Hash`, and
- * may in the future be merged with that trait or otherwise
- * modified when default methods and trait inheritance are
- * completed.
- */
+///
+/// A trait to implement in order to make a type hashable;
+/// This works in combination with the trait `std::hash::Hash`, and
+/// may in the future be merged with that trait or otherwise
+/// modified when default methods and trait inheritance are
+/// completed.
+///
+/// IterBytes should be implemented so that the extent of the
+/// produced byte stream can be discovered, given the original
+/// type; otherwise distinct compound values can hash the same.
+/// For example, the IterBytes implementation for vectors emits
+/// its length first, and enums should emit their discriminant.
+///
 pub trait IterBytes {
-    /**
-     * Call the provided callback `f` one or more times with
-     * byte-slices that should be used when computing a hash
-     * value or otherwise "flattening" the structure into
-     * a sequence of bytes. The `lsb0` parameter conveys
-     * whether the caller is asking for little-endian bytes
-     * (`true`) or big-endian (`false`); this should only be
-     * relevant in implementations that represent a single
-     * multi-byte datum such as a 32 bit integer or 64 bit
-     * floating-point value. It can be safely ignored for
-     * larger structured types as they are usually processed
-     * left-to-right in declaration order, regardless of
-     * underlying memory endianness.
-     */
+    /// Call the provided callback `f` one or more times with
+    /// byte-slices that should be used when computing a hash
+    /// value or otherwise "flattening" the structure into
+    /// a sequence of bytes. The `lsb0` parameter conveys
+    /// whether the caller is asking for little-endian bytes
+    /// (`true`) or big-endian (`false`); this should only be
+    /// relevant in implementations that represent a single
+    /// multi-byte datum such as a 32 bit integer or 64 bit
+    /// floating-point value. It can be safely ignored for
+    /// larger structured types as they are usually processed
+    /// left-to-right in declaration order, regardless of
+    /// underlying memory endianness.
+    ///
     fn iter_bytes(&self, lsb0: bool, f: Cb) -> bool;
 }
 
@@ -224,6 +230,7 @@ impl IterBytes for f64 {
 impl<'self,A:IterBytes> IterBytes for &'self [A] {
     #[inline]
     fn iter_bytes(&self, lsb0: bool, f: Cb) -> bool {
+        self.len().iter_bytes(lsb0, |b| f(b)) &&
         self.iter().advance(|elt| elt.iter_bytes(lsb0, |b| f(b)))
     }
 }
@@ -251,47 +258,39 @@ impl<A:IterBytes,B:IterBytes,C:IterBytes> IterBytes for (A,B,C) {
   }
 }
 
-// Move this to vec, probably.
-fn borrow<'x,A>(a: &'x [A]) -> &'x [A] {
-    a
-}
-
 impl<A:IterBytes> IterBytes for ~[A] {
     #[inline]
     fn iter_bytes(&self, lsb0: bool, f: Cb) -> bool {
-        borrow(*self).iter_bytes(lsb0, f)
+        self.as_slice().iter_bytes(lsb0, f)
     }
 }
 
 impl<A:IterBytes> IterBytes for @[A] {
     #[inline]
     fn iter_bytes(&self, lsb0: bool, f: Cb) -> bool {
-        borrow(*self).iter_bytes(lsb0, f)
+        self.as_slice().iter_bytes(lsb0, f)
     }
 }
 
 impl<'self> IterBytes for &'self str {
     #[inline]
     fn iter_bytes(&self, _lsb0: bool, f: Cb) -> bool {
-        f(self.as_bytes())
+        // Terminate the string with a byte that does not appear in UTF-8
+        f(self.as_bytes()) && f([0xFF])
     }
 }
 
 impl IterBytes for ~str {
     #[inline]
-    fn iter_bytes(&self, _lsb0: bool, f: Cb) -> bool {
-        // this should possibly include the null terminator, but that
-        // breaks .find_equiv on hashmaps.
-        f(self.as_bytes())
+    fn iter_bytes(&self, lsb0: bool, f: Cb) -> bool {
+        self.as_slice().iter_bytes(lsb0, f)
     }
 }
 
 impl IterBytes for @str {
     #[inline]
-    fn iter_bytes(&self, _lsb0: bool, f: Cb) -> bool {
-        // this should possibly include the null terminator, but that
-        // breaks .find_equiv on hashmaps.
-        f(self.as_bytes())
+    fn iter_bytes(&self, lsb0: bool, f: Cb) -> bool {
+        self.as_slice().iter_bytes(lsb0, f)
     }
 }