std::to_bytes: Delimit sequences &[A] and ~str when hashing

Address issue #5257. For example, these values all had the same hash value: ("aaa", "bbb", "ccc"), ("aaab", "bb", "ccc"), ("aaabbb", "", "ccc"). IterBytes for &[A] now includes the length, emitted before calling iter_bytes on each element. IterBytes for &str is now terminated by a byte that does not appear in UTF-8; this way, only one more byte is processed when hashing strings.
2013-08-15 05:23:33 +02:00 · 2013-08-15 05:23:33 +02:00 · 6066118b29
commit 6066118b29
parent 0a238288d3
3 changed files with 47 additions and 42 deletions
--- a/src/libstd/hash.rs
+++ b/src/libstd/hash.rs
@ -409,6 +409,14 @@ mod tests {

    use uint;

+    // Hash just the bytes of the slice, without length prefix
+    struct Bytes<'self>(&'self [u8]);
+    impl<'self> IterBytes for Bytes<'self> {
+        fn iter_bytes(&self, _lsb0: bool, f: &fn(&[u8]) -> bool) -> bool {
+            f(**self)
+        }
+    }
+
    #[test]
    fn test_siphash() {
        let vecs : [[u8, ..8], ..64] = [
@ -496,7 +504,7 @@ mod tests {
        while t < 64 {
            debug!("siphash test %?", t);
            let vec = u8to64_le!(vecs[t], 0);
-            let out = buf.hash_keyed(k0, k1);
+            let out = Bytes(buf.as_slice()).hash_keyed(k0, k1);
            debug!("got %?, expected %?", out, vec);
            assert_eq!(vec, out);

--- a/src/libstd/str/ascii.rs
+++ b/src/libstd/str/ascii.rs
@ -376,7 +376,6 @@ static ASCII_UPPER_MAP: &'static [u8] = &[
 #[cfg(test)]
 mod tests {
    use super::*;
-    use to_bytes::ToBytes;
    use str::from_char;

    macro_rules! v2ascii (
@ -445,7 +444,6 @@ mod tests {

    #[test]
    fn test_ascii_to_bytes() {
-        assert_eq!(v2ascii!(~[40, 32, 59]).to_bytes(false), ~[40u8, 32u8, 59u8]);
        assert_eq!(v2ascii!(~[40, 32, 59]).into_bytes(), ~[40u8, 32u8, 59u8]);
    }

--- a/src/libstd/to_bytes.rs
+++ b/src/libstd/to_bytes.rs
@ -15,37 +15,43 @@ The `ToBytes` and `IterBytes` traits
 */

 use cast;
+use container::Container;
 use io;
 use io::Writer;
 use iterator::Iterator;
 use option::{None, Option, Some};
-use str::StrSlice;
-use vec::ImmutableVector;
+use str::{Str, StrSlice};
+use vec::{Vector, ImmutableVector};

 pub type Cb<'self> = &'self fn(buf: &[u8]) -> bool;

-/**
- * A trait to implement in order to make a type hashable;
- * This works in combination with the trait `Hash::Hash`, and
- * may in the future be merged with that trait or otherwise
- * modified when default methods and trait inheritance are
- * completed.
- */
+///
+/// A trait to implement in order to make a type hashable;
+/// This works in combination with the trait `std::hash::Hash`, and
+/// may in the future be merged with that trait or otherwise
+/// modified when default methods and trait inheritance are
+/// completed.
+///
+/// IterBytes should be implemented so that the extent of the
+/// produced byte stream can be discovered, given the original
+/// type.
+/// For example, the IterBytes implementation for vectors emits
+/// its length first, and enums should emit their discriminant.
+///
 pub trait IterBytes {
-    /**
-     * Call the provided callback `f` one or more times with
-     * byte-slices that should be used when computing a hash
-     * value or otherwise "flattening" the structure into
-     * a sequence of bytes. The `lsb0` parameter conveys
-     * whether the caller is asking for little-endian bytes
-     * (`true`) or big-endian (`false`); this should only be
-     * relevant in implementations that represent a single
-     * multi-byte datum such as a 32 bit integer or 64 bit
-     * floating-point value. It can be safely ignored for
-     * larger structured types as they are usually processed
-     * left-to-right in declaration order, regardless of
-     * underlying memory endianness.
-     */
+    /// Call the provided callback `f` one or more times with
+    /// byte-slices that should be used when computing a hash
+    /// value or otherwise "flattening" the structure into
+    /// a sequence of bytes. The `lsb0` parameter conveys
+    /// whether the caller is asking for little-endian bytes
+    /// (`true`) or big-endian (`false`); this should only be
+    /// relevant in implementations that represent a single
+    /// multi-byte datum such as a 32 bit integer or 64 bit
+    /// floating-point value. It can be safely ignored for
+    /// larger structured types as they are usually processed
+    /// left-to-right in declaration order, regardless of
+    /// underlying memory endianness.
+    ///
    fn iter_bytes(&self, lsb0: bool, f: Cb) -> bool;
 }

@ -224,6 +230,7 @@ impl IterBytes for f64 {
 impl<'self,A:IterBytes> IterBytes for &'self [A] {
    #[inline]
    fn iter_bytes(&self, lsb0: bool, f: Cb) -> bool {
+        self.len().iter_bytes(lsb0, |b| f(b)) &&
        self.iter().advance(|elt| elt.iter_bytes(lsb0, |b| f(b)))
    }
 }
@ -251,47 +258,39 @@ impl<A:IterBytes,B:IterBytes,C:IterBytes> IterBytes for (A,B,C) {
  }
 }

-// Move this to vec, probably.
-fn borrow<'x,A>(a: &'x [A]) -> &'x [A] {
-    a
-}
-
 impl<A:IterBytes> IterBytes for ~[A] {
    #[inline]
    fn iter_bytes(&self, lsb0: bool, f: Cb) -> bool {
-        borrow(*self).iter_bytes(lsb0, f)
+        self.as_slice().iter_bytes(lsb0, f)
    }
 }

 impl<A:IterBytes> IterBytes for @[A] {
    #[inline]
    fn iter_bytes(&self, lsb0: bool, f: Cb) -> bool {
-        borrow(*self).iter_bytes(lsb0, f)
+        self.as_slice().iter_bytes(lsb0, f)
    }
 }

 impl<'self> IterBytes for &'self str {
    #[inline]
    fn iter_bytes(&self, _lsb0: bool, f: Cb) -> bool {
-        f(self.as_bytes())
+        // Terminate the string with a byte that does not appear in UTF-8
+        f(self.as_bytes()) && f([0xFF])
    }
 }

 impl IterBytes for ~str {
    #[inline]
-    fn iter_bytes(&self, _lsb0: bool, f: Cb) -> bool {
-        // this should possibly include the null terminator, but that
-        // breaks .find_equiv on hashmaps.
-        f(self.as_bytes())
+    fn iter_bytes(&self, lsb0: bool, f: Cb) -> bool {
+        self.as_slice().iter_bytes(lsb0, f)
    }
 }

 impl IterBytes for @str {
    #[inline]
-    fn iter_bytes(&self, _lsb0: bool, f: Cb) -> bool {
-        // this should possibly include the null terminator, but that
-        // breaks .find_equiv on hashmaps.
-        f(self.as_bytes())
+    fn iter_bytes(&self, lsb0: bool, f: Cb) -> bool {
+        self.as_slice().iter_bytes(lsb0, f)
    }
 }